{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 185.1851851851852, "eval_steps": 500, "global_step": 10000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.5, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 9.5, "completions/mean_terminated_length": 9.5, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.018518518518518517, "frac_reward_zero_std": 1.0, "grad_norm": 0.015276861377060413, "kl": 0.00010346825001761317, "learning_rate": 0.0, "loss": 0.0, "num_tokens": 262.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 34.25, "completions/clipped_ratio": 0.0, "completions/max_length": 43.0, "completions/max_terminated_length": 43.0, "completions/mean_length": 34.25, "completions/mean_terminated_length": 34.25, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 0.037037037037037035, "frac_reward_zero_std": 0.0, "grad_norm": 4.382180213928223, "kl": -3.82081061234274e-10, "learning_rate": 5e-09, "loss": 0.1213, "num_tokens": 619.0, "reward": -0.375, "reward_std": 1.4361406564712524, "rewards/reward_combined/mean": -0.375, "rewards/reward_combined/std": 1.4361406564712524, "step": 2 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 79.5, "completions/clipped_ratio": 0.25, "completions/max_length": 256.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 79.5, "completions/mean_terminated_length": 20.666667938232422, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 0.05555555555555555, "frac_reward_zero_std": 0.0, "grad_norm": 3.473931312561035, "kl": 0.0008232023101300001, "learning_rate": 1e-08, "loss": 0.2213, "num_tokens": 1157.0, "reward": 1.4249999523162842, "reward_std": 1.8227726221084595, "rewards/reward_combined/mean": 1.4249999523162842, "rewards/reward_combined/std": 1.8227726221084595, "step": 3 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.25, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 4.25, "completions/mean_terminated_length": 4.25, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 0.07407407407407407, "frac_reward_zero_std": 0.0, "grad_norm": 10.117016792297363, "kl": 7.714480307186022e-05, "learning_rate": 1.5000000000000002e-08, "loss": 0.0477, "num_tokens": 1394.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 4 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 41.5, "completions/clipped_ratio": 0.0, "completions/max_length": 48.0, "completions/max_terminated_length": 48.0, "completions/mean_length": 41.5, "completions/mean_terminated_length": 41.5, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 0.09259259259259259, "frac_reward_zero_std": 0.0, "grad_norm": 3.1065986156463623, "kl": 0.0005084725708002225, "learning_rate": 2e-08, "loss": -0.0048, "num_tokens": 1784.0, "reward": -1.125, "reward_std": 2.136000871658325, "rewards/reward_combined/mean": -1.125, "rewards/reward_combined/std": 2.136000871658325, "step": 5 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.75, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.75, "completions/mean_terminated_length": 14.75, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 0.1111111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.004500226583331823, "kl": 7.377721254897551e-05, "learning_rate": 2.5000000000000002e-08, "loss": 0.0, "num_tokens": 2075.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 144.25, "completions/clipped_ratio": 0.5, "completions/max_length": 256.0, "completions/max_terminated_length": 33.0, "completions/mean_length": 144.25, "completions/mean_terminated_length": 32.5, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 0.12962962962962962, "frac_reward_zero_std": 0.0, "grad_norm": 0.9816138744354248, "kl": 0.00030846422305330634, "learning_rate": 3.0000000000000004e-08, "loss": 0.2044, "num_tokens": 2868.0, "reward": 0.10000002384185791, "reward_std": 2.445403814315796, "rewards/reward_combined/mean": 0.10000002384185791, "rewards/reward_combined/std": 2.445404052734375, "step": 7 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.75, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 4.75, "completions/mean_terminated_length": 4.75, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 0.14814814814814814, "frac_reward_zero_std": 1.0, "grad_norm": 0.016983000561594963, "kl": 0.00023663640308768663, "learning_rate": 3.5e-08, "loss": 0.0, "num_tokens": 3087.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 8 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 0.16666666666666666, "frac_reward_zero_std": 1.0, "grad_norm": 0.0006538742454722524, "kl": 5.140900611877441e-06, "learning_rate": 4e-08, "loss": 0.0, "num_tokens": 3299.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 9 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.0, "completions/clipped_ratio": 0.0, "completions/max_length": 25.0, "completions/max_terminated_length": 25.0, "completions/mean_length": 19.0, "completions/mean_terminated_length": 19.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 0.18518518518518517, "frac_reward_zero_std": 1.0, "grad_norm": 0.0032633880618959665, "kl": 4.470159183256328e-05, "learning_rate": 4.5e-08, "loss": 0.0, "num_tokens": 3615.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_combined/mean": 1.0, "rewards/reward_combined/std": 0.0, "step": 10 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 0.2037037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.005074419546872377, "kl": 8.406064353039255e-05, "learning_rate": 5.0000000000000004e-08, "loss": 0.0, "num_tokens": 3907.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 11 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.25, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.25, "completions/mean_terminated_length": 6.25, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 0.2222222222222222, "frac_reward_zero_std": 1.0, "grad_norm": 0.03197479993104935, "kl": 0.0009412389190401882, "learning_rate": 5.5e-08, "loss": 0.0, "num_tokens": 4140.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 12 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 33.0, "completions/clipped_ratio": 0.0, "completions/max_length": 37.0, "completions/max_terminated_length": 37.0, "completions/mean_length": 33.0, "completions/mean_terminated_length": 33.0, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 0.24074074074074073, "frac_reward_zero_std": 0.0, "grad_norm": 3.304600715637207, "kl": 0.0005858830409124494, "learning_rate": 6.000000000000001e-08, "loss": 0.0358, "num_tokens": 4524.0, "reward": 1.75, "reward_std": 4.941322326660156, "rewards/reward_combined/mean": 1.75, "rewards/reward_combined/std": 4.941322326660156, "step": 13 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 95.0, "completions/clipped_ratio": 0.25, "completions/max_length": 256.0, "completions/max_terminated_length": 72.0, "completions/mean_length": 95.0, "completions/mean_terminated_length": 41.333335876464844, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 0.25925925925925924, "frac_reward_zero_std": 0.0, "grad_norm": 1.4981350898742676, "kl": 0.0003046839265152812, "learning_rate": 6.5e-08, "loss": 0.5442, "num_tokens": 5124.0, "reward": 2.174999952316284, "reward_std": 3.998645782470703, "rewards/reward_combined/mean": 2.174999952316284, "rewards/reward_combined/std": 3.998645544052124, "step": 14 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.0, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 19.0, "completions/mean_terminated_length": 19.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.2777777777777778, "frac_reward_zero_std": 1.0, "grad_norm": 0.028057513758540154, "kl": 0.0009409683407284319, "learning_rate": 7e-08, "loss": 0.0001, "num_tokens": 5420.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 15 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.75, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 13.75, "completions/mean_terminated_length": 13.75, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 0.2962962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.009183578193187714, "kl": 0.00027411559130996466, "learning_rate": 7.500000000000001e-08, "loss": 0.0, "num_tokens": 5707.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 16 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 41.25, "completions/clipped_ratio": 0.0, "completions/max_length": 50.0, "completions/max_terminated_length": 50.0, "completions/mean_length": 41.25, "completions/mean_terminated_length": 41.25, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "epoch": 0.3148148148148148, "frac_reward_zero_std": 0.0, "grad_norm": 1.0409907102584839, "kl": 0.0001727676863083616, "learning_rate": 8e-08, "loss": 0.0274, "num_tokens": 6152.0, "reward": 0.5250000357627869, "reward_std": 1.1786291599273682, "rewards/reward_combined/mean": 0.5250000357627869, "rewards/reward_combined/std": 1.1786291599273682, "step": 17 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.75, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 13.75, "completions/mean_terminated_length": 13.75, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.3333333333333333, "frac_reward_zero_std": 0.0, "grad_norm": 3.841236114501953, "kl": 0.0001423842113581486, "learning_rate": 8.500000000000001e-08, "loss": 0.0198, "num_tokens": 6431.0, "reward": 5.75, "reward_std": 2.598076105117798, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 2.598076105117798, "step": 18 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.25, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.25, "completions/mean_terminated_length": 14.25, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 0.35185185185185186, "frac_reward_zero_std": 0.0, "grad_norm": 2.3299195766448975, "kl": 0.0004147829095018096, "learning_rate": 9e-08, "loss": -0.0001, "num_tokens": 6712.0, "reward": 6.125, "reward_std": 3.4247870445251465, "rewards/reward_combined/mean": 6.125, "rewards/reward_combined/std": 3.4247870445251465, "step": 19 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.75, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 13.75, "completions/mean_terminated_length": 13.75, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 0.37037037037037035, "frac_reward_zero_std": 1.0, "grad_norm": 0.004788925871253014, "kl": 7.150529563659802e-05, "learning_rate": 9.5e-08, "loss": 0.0, "num_tokens": 6999.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 20 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.3888888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.014870761893689632, "kl": 0.00011655241542030126, "learning_rate": 1.0000000000000001e-07, "loss": 0.0, "num_tokens": 7255.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 21 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 9.5, "completions/mean_terminated_length": 9.5, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 0.4074074074074074, "frac_reward_zero_std": 0.0, "grad_norm": 3.9613094329833984, "kl": 0.0003316766378702596, "learning_rate": 1.0500000000000001e-07, "loss": 0.0431, "num_tokens": 7513.0, "reward": 7.25, "reward_std": 1.5, "rewards/reward_combined/mean": 7.25, "rewards/reward_combined/std": 1.5, "step": 22 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.5, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 0.42592592592592593, "frac_reward_zero_std": 0.0, "grad_norm": 4.091280937194824, "kl": 0.00035492231836542487, "learning_rate": 1.1e-07, "loss": 0.0584, "num_tokens": 7803.0, "reward": 2.75, "reward_std": 3.5, "rewards/reward_combined/mean": 2.75, "rewards/reward_combined/std": 3.5, "step": 23 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.25, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 10.25, "completions/mean_terminated_length": 10.25, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 0.4444444444444444, "frac_reward_zero_std": 0.0, "grad_norm": 11.729456901550293, "kl": 0.0006715949857607484, "learning_rate": 1.1500000000000001e-07, "loss": 0.3658, "num_tokens": 8064.0, "reward": 2.75, "reward_std": 1.5, "rewards/reward_combined/mean": 2.75, "rewards/reward_combined/std": 1.5, "step": 24 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 0.46296296296296297, "frac_reward_zero_std": 1.0, "grad_norm": 6.254755930967804e-07, "kl": 0.0, "learning_rate": 1.2000000000000002e-07, "loss": 0.0, "num_tokens": 8284.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 25 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 34.25, "completions/clipped_ratio": 0.0, "completions/max_length": 43.0, "completions/max_terminated_length": 43.0, "completions/mean_length": 34.25, "completions/mean_terminated_length": 34.25, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 0.48148148148148145, "frac_reward_zero_std": 0.0, "grad_norm": 4.067286968231201, "kl": 0.0004905923269689083, "learning_rate": 1.2500000000000002e-07, "loss": 0.0464, "num_tokens": 8649.0, "reward": -0.625, "reward_std": 2.25, "rewards/reward_combined/mean": -0.625, "rewards/reward_combined/std": 2.25, "step": 26 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.25, "completions/clipped_ratio": 0.0, "completions/max_length": 43.0, "completions/max_terminated_length": 43.0, "completions/mean_length": 21.25, "completions/mean_terminated_length": 21.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 0.5, "frac_reward_zero_std": 0.0, "grad_norm": 4.285185813903809, "kl": 0.0005321674689184874, "learning_rate": 1.3e-07, "loss": -0.0051, "num_tokens": 8950.0, "reward": 4.5, "reward_std": 4.041451930999756, "rewards/reward_combined/mean": 4.5, "rewards/reward_combined/std": 4.041451930999756, "step": 27 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 7.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 7.0, "completions/mean_terminated_length": 7.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 0.5185185185185185, "frac_reward_zero_std": 0.0, "grad_norm": 7.33591365814209, "kl": 0.00034890323877334595, "learning_rate": 1.35e-07, "loss": -0.1691, "num_tokens": 9190.0, "reward": 3.875, "reward_std": 0.25, "rewards/reward_combined/mean": 3.875, "rewards/reward_combined/std": 0.25, "step": 28 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 0.5370370370370371, "frac_reward_zero_std": 1.0, "grad_norm": 0.054740507155656815, "kl": 0.0010894648730754852, "learning_rate": 1.4e-07, "loss": 0.0001, "num_tokens": 9406.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 29 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 29.25, "completions/clipped_ratio": 0.0, "completions/max_length": 45.0, "completions/max_terminated_length": 45.0, "completions/mean_length": 29.25, "completions/mean_terminated_length": 29.25, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.5555555555555556, "frac_reward_zero_std": 0.0, "grad_norm": 1.1007702350616455, "kl": 1.5891260147782305e-06, "learning_rate": 1.4500000000000001e-07, "loss": -0.1501, "num_tokens": 9791.0, "reward": 2.75, "reward_std": 3.5, "rewards/reward_combined/mean": 2.75, "rewards/reward_combined/std": 3.5, "step": 30 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.25, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 17.25, "completions/mean_terminated_length": 17.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 0.5740740740740741, "frac_reward_zero_std": 0.0, "grad_norm": 2.81711745262146, "kl": 0.00015664679813198745, "learning_rate": 1.5000000000000002e-07, "loss": -0.0176, "num_tokens": 10100.0, "reward": 4.375, "reward_std": 4.75, "rewards/reward_combined/mean": 4.375, "rewards/reward_combined/std": 4.75, "step": 31 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.25, "completions/clipped_ratio": 0.0, "completions/max_length": 29.0, "completions/max_terminated_length": 29.0, "completions/mean_length": 17.25, "completions/mean_terminated_length": 17.25, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.5925925925925926, "frac_reward_zero_std": 0.0, "grad_norm": 4.665148735046387, "kl": 0.00013512347868527286, "learning_rate": 1.5500000000000002e-07, "loss": 0.254, "num_tokens": 10381.0, "reward": 5.25, "reward_std": 3.4034295082092285, "rewards/reward_combined/mean": 5.25, "rewards/reward_combined/std": 3.4034297466278076, "step": 32 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.25, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 8.25, "completions/mean_terminated_length": 8.25, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 0.6111111111111112, "frac_reward_zero_std": 0.0, "grad_norm": 6.739888668060303, "kl": 0.00021044744383402758, "learning_rate": 1.6e-07, "loss": -0.094, "num_tokens": 10638.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 33 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 0.6296296296296297, "frac_reward_zero_std": 0.0, "grad_norm": 6.045888423919678, "kl": 0.0006905360205564648, "learning_rate": 1.65e-07, "loss": 0.1351, "num_tokens": 10930.0, "reward": 1.875, "reward_std": 1.6520190238952637, "rewards/reward_combined/mean": 1.875, "rewards/reward_combined/std": 1.6520190238952637, "step": 34 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 0.6481481481481481, "frac_reward_zero_std": 1.0, "grad_norm": 0.03112613968551159, "kl": 0.0005674511194229126, "learning_rate": 1.7000000000000001e-07, "loss": 0.0, "num_tokens": 11138.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 35 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 82.5, "completions/clipped_ratio": 0.25, "completions/max_length": 256.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 82.5, "completions/mean_terminated_length": 24.666667938232422, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 0.6666666666666666, "frac_reward_zero_std": 0.0, "grad_norm": 1.3666894435882568, "kl": 0.00016910888371057808, "learning_rate": 1.7500000000000002e-07, "loss": 0.4141, "num_tokens": 11696.0, "reward": 5.050000190734863, "reward_std": 5.900000095367432, "rewards/reward_combined/mean": 5.050000190734863, "rewards/reward_combined/std": 5.90000057220459, "step": 36 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 0.6851851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.01749885268509388, "kl": 0.00019263103604316711, "learning_rate": 1.8e-07, "loss": 0.0, "num_tokens": 11908.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 37 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.25, "completions/clipped_ratio": 0.0, "completions/max_length": 30.0, "completions/max_terminated_length": 30.0, "completions/mean_length": 22.25, "completions/mean_terminated_length": 22.25, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 0.7037037037037037, "frac_reward_zero_std": 0.0, "grad_norm": 3.7397072315216064, "kl": 0.00026694054395193234, "learning_rate": 1.85e-07, "loss": 0.089, "num_tokens": 12221.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 38 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.75, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 14.75, "completions/mean_terminated_length": 14.75, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 0.7222222222222222, "frac_reward_zero_std": 1.0, "grad_norm": 0.0003892584063578397, "kl": 8.234895403802511e-06, "learning_rate": 1.9e-07, "loss": 0.0, "num_tokens": 12540.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 39 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 0.7407407407407407, "frac_reward_zero_std": 0.0, "grad_norm": 5.948550224304199, "kl": 0.0002654254494700581, "learning_rate": 1.95e-07, "loss": 0.2095, "num_tokens": 12812.0, "reward": 3.875, "reward_std": 0.25, "rewards/reward_combined/mean": 3.875, "rewards/reward_combined/std": 0.25, "step": 40 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 50.0, "completions/clipped_ratio": 0.0, "completions/max_length": 128.0, "completions/max_terminated_length": 128.0, "completions/mean_length": 50.0, "completions/mean_terminated_length": 50.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 0.7592592592592593, "frac_reward_zero_std": 0.0, "grad_norm": 3.361522674560547, "kl": 0.0009554388525430113, "learning_rate": 2.0000000000000002e-07, "loss": 0.3914, "num_tokens": 13252.0, "reward": 1.1749999523162842, "reward_std": 1.5564382076263428, "rewards/reward_combined/mean": 1.1749999523162842, "rewards/reward_combined/std": 1.5564383268356323, "step": 41 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.25, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 12.25, "completions/mean_terminated_length": 12.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 0.7777777777777778, "frac_reward_zero_std": 0.0, "grad_norm": 5.985459804534912, "kl": 0.000670149689540267, "learning_rate": 2.0500000000000002e-07, "loss": 0.0901, "num_tokens": 13553.0, "reward": 5.125, "reward_std": 3.4731109142303467, "rewards/reward_combined/mean": 5.125, "rewards/reward_combined/std": 3.4731109142303467, "step": 42 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.75, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 17.75, "completions/mean_terminated_length": 17.75, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 0.7962962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.010001352988183498, "kl": 0.00018666524556465447, "learning_rate": 2.1000000000000003e-07, "loss": 0.0, "num_tokens": 13872.0, "reward": 0.5, "reward_std": 0.0, "rewards/reward_combined/mean": 0.5, "rewards/reward_combined/std": 0.0, "step": 43 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 63.75, "completions/clipped_ratio": 0.0, "completions/max_length": 194.0, "completions/max_terminated_length": 194.0, "completions/mean_length": 63.75, "completions/mean_terminated_length": 63.75, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 0.8148148148148148, "frac_reward_zero_std": 0.0, "grad_norm": 4.169360637664795, "kl": 0.0003253379982197657, "learning_rate": 2.15e-07, "loss": 0.2871, "num_tokens": 14347.0, "reward": 5.675000190734863, "reward_std": 3.139400005340576, "rewards/reward_combined/mean": 5.675000190734863, "rewards/reward_combined/std": 3.139400005340576, "step": 44 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.8333333333333334, "frac_reward_zero_std": 1.0, "grad_norm": 0.01016579195857048, "kl": 0.00011322697355353739, "learning_rate": 2.2e-07, "loss": 0.0, "num_tokens": 14617.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 45 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.5, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 16.5, "completions/mean_terminated_length": 16.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 0.8518518518518519, "frac_reward_zero_std": 0.0, "grad_norm": 3.333481788635254, "kl": 0.00011563616135390475, "learning_rate": 2.2500000000000002e-07, "loss": 0.0564, "num_tokens": 14911.0, "reward": 4.75, "reward_std": 4.7169904708862305, "rewards/reward_combined/mean": 4.75, "rewards/reward_combined/std": 4.7169904708862305, "step": 46 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 0.8703703703703703, "frac_reward_zero_std": 1.0, "grad_norm": 0.0005032677436247468, "kl": 5.104357285290462e-06, "learning_rate": 2.3000000000000002e-07, "loss": 0.0, "num_tokens": 15179.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 47 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.5, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 20.5, "completions/mean_terminated_length": 20.5, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 0.8888888888888888, "frac_reward_zero_std": 0.0, "grad_norm": 2.153545379638672, "kl": 0.00014264951096265577, "learning_rate": 2.3500000000000003e-07, "loss": 0.0516, "num_tokens": 15509.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 48 }, { "clip_ratio/high_max": 0.010416666977107525, "clip_ratio/high_mean": 0.010416666977107525, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010416666977107525, "completion_length": 24.0, "completions/clipped_ratio": 0.0, "completions/max_length": 30.0, "completions/max_terminated_length": 30.0, "completions/mean_length": 24.0, "completions/mean_terminated_length": 24.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 0.9074074074074074, "frac_reward_zero_std": 0.0, "grad_norm": 6.017335414886475, "kl": 0.0011911321198567748, "learning_rate": 2.4000000000000003e-07, "loss": 0.021, "num_tokens": 15853.0, "reward": 0.25, "reward_std": 1.5, "rewards/reward_combined/mean": 0.25, "rewards/reward_combined/std": 1.5, "step": 49 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.5, "completions/clipped_ratio": 0.0, "completions/max_length": 29.0, "completions/max_terminated_length": 29.0, "completions/mean_length": 23.5, "completions/mean_terminated_length": 23.5, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 0.9259259259259259, "frac_reward_zero_std": 0.0, "grad_norm": 3.0726490020751953, "kl": 0.000345742839272134, "learning_rate": 2.4500000000000004e-07, "loss": 0.0488, "num_tokens": 16191.0, "reward": 5.75, "reward_std": 2.598076105117798, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 2.598076105117798, "step": 50 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.5, "completions/clipped_ratio": 0.0, "completions/max_length": 27.0, "completions/max_terminated_length": 27.0, "completions/mean_length": 22.5, "completions/mean_terminated_length": 22.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 0.9444444444444444, "frac_reward_zero_std": 0.0, "grad_norm": 2.02010178565979, "kl": 0.00023531966871814802, "learning_rate": 2.5000000000000004e-07, "loss": 0.049, "num_tokens": 16533.0, "reward": 3.875, "reward_std": 2.9545164108276367, "rewards/reward_combined/mean": 3.875, "rewards/reward_combined/std": 2.9545164108276367, "step": 51 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.009259259328246117, "clip_ratio/low_min": 0.009259259328246117, "clip_ratio/region_mean": 0.009259259328246117, "completion_length": 21.0, "completions/clipped_ratio": 0.0, "completions/max_length": 39.0, "completions/max_terminated_length": 39.0, "completions/mean_length": 21.0, "completions/mean_terminated_length": 21.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 0.9629629629629629, "frac_reward_zero_std": 0.0, "grad_norm": 5.89812707901001, "kl": 0.0005319359916029498, "learning_rate": 2.55e-07, "loss": 0.2195, "num_tokens": 16841.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 52 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.25, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 18.25, "completions/mean_terminated_length": 18.25, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.9814814814814815, "frac_reward_zero_std": 0.0, "grad_norm": 2.5263047218322754, "kl": 0.00015732928113720845, "learning_rate": 2.6e-07, "loss": 0.0517, "num_tokens": 17154.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 53 }, { "clip_ratio/high_max": 0.001623376621864736, "clip_ratio/high_mean": 0.001623376621864736, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.001623376621864736, "completion_length": 97.5, "completions/clipped_ratio": 0.25, "completions/max_length": 256.0, "completions/max_terminated_length": 52.0, "completions/mean_length": 97.5, "completions/mean_terminated_length": 44.66666793823242, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 1.0, "frac_reward_zero_std": 0.0, "grad_norm": 2.8974485397338867, "kl": 0.0004155954229645431, "learning_rate": 2.65e-07, "loss": 0.0755, "num_tokens": 17780.0, "reward": -0.574999988079071, "reward_std": 1.9551215171813965, "rewards/reward_combined/mean": -0.574999988079071, "rewards/reward_combined/std": 1.9551215171813965, "step": 54 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 134.5, "completions/clipped_ratio": 0.5, "completions/max_length": 256.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 134.5, "completions/mean_terminated_length": 13.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 1.0185185185185186, "frac_reward_zero_std": 0.0, "grad_norm": 4.136843681335449, "kl": 0.0003727899747900665, "learning_rate": 2.7e-07, "loss": -0.0251, "num_tokens": 18534.0, "reward": 1.6749999523162842, "reward_std": 4.474650859832764, "rewards/reward_combined/mean": 1.6749999523162842, "rewards/reward_combined/std": 4.474650859832764, "step": 55 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.5, "completions/clipped_ratio": 0.0, "completions/max_length": 28.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 22.5, "completions/mean_terminated_length": 22.5, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 1.037037037037037, "frac_reward_zero_std": 0.0, "grad_norm": 7.818897247314453, "kl": 0.0010026985546573997, "learning_rate": 2.75e-07, "loss": 0.0964, "num_tokens": 18864.0, "reward": 0.42500001192092896, "reward_std": 2.255918264389038, "rewards/reward_combined/mean": 0.42500001192092896, "rewards/reward_combined/std": 2.255918025970459, "step": 56 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 1.0555555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.01421457901597023, "kl": 0.00020880624651908875, "learning_rate": 2.8e-07, "loss": 0.0, "num_tokens": 19124.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 57 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.5, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 12.5, "completions/mean_terminated_length": 12.5, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 1.074074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.0007260098354890943, "kl": 7.1798758654040284e-06, "learning_rate": 2.85e-07, "loss": 0.0, "num_tokens": 19398.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 58 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 26.5, "completions/clipped_ratio": 0.0, "completions/max_length": 33.0, "completions/max_terminated_length": 33.0, "completions/mean_length": 26.5, "completions/mean_terminated_length": 26.5, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 1.0925925925925926, "frac_reward_zero_std": 0.0, "grad_norm": 1.8639434576034546, "kl": 0.00014992092474130914, "learning_rate": 2.9000000000000003e-07, "loss": -0.009, "num_tokens": 19752.0, "reward": 1.6749999523162842, "reward_std": 4.925021171569824, "rewards/reward_combined/mean": 1.6749999523162842, "rewards/reward_combined/std": 4.925021171569824, "step": 59 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 1.1111111111111112, "frac_reward_zero_std": 1.0, "grad_norm": 0.007472708821296692, "kl": 0.00019415328279137611, "learning_rate": 2.9500000000000003e-07, "loss": 0.0, "num_tokens": 20052.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 60 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 27.0, "completions/max_terminated_length": 27.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 1.1296296296296295, "frac_reward_zero_std": 0.0, "grad_norm": 5.869415283203125, "kl": 0.000837779080029577, "learning_rate": 3.0000000000000004e-07, "loss": 0.098, "num_tokens": 20330.0, "reward": 5.375, "reward_std": 3.326033592224121, "rewards/reward_combined/mean": 5.375, "rewards/reward_combined/std": 3.326033592224121, "step": 61 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.25, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 4.25, "completions/mean_terminated_length": 4.25, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 1.1481481481481481, "frac_reward_zero_std": 0.0, "grad_norm": 10.953880310058594, "kl": 0.0003061145544052124, "learning_rate": 3.0500000000000004e-07, "loss": 0.0643, "num_tokens": 20543.0, "reward": 2.75, "reward_std": 1.5, "rewards/reward_combined/mean": 2.75, "rewards/reward_combined/std": 1.5, "step": 62 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 1.1666666666666667, "frac_reward_zero_std": 0.0, "grad_norm": 2.4792659282684326, "kl": 0.00011469334640423767, "learning_rate": 3.1000000000000005e-07, "loss": -0.0007, "num_tokens": 20839.0, "reward": 2.875, "reward_std": 3.3008837699890137, "rewards/reward_combined/mean": 2.875, "rewards/reward_combined/std": 3.3008837699890137, "step": 63 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.25, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 12.25, "completions/mean_terminated_length": 12.25, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 1.1851851851851851, "frac_reward_zero_std": 0.0, "grad_norm": 3.333653450012207, "kl": 0.00026569234614726156, "learning_rate": 3.15e-07, "loss": -0.1765, "num_tokens": 21112.0, "reward": 2.875, "reward_std": 1.25, "rewards/reward_combined/mean": 2.875, "rewards/reward_combined/std": 1.25, "step": 64 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 41.0, "completions/clipped_ratio": 0.0, "completions/max_length": 115.0, "completions/max_terminated_length": 115.0, "completions/mean_length": 41.0, "completions/mean_terminated_length": 41.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 1.2037037037037037, "frac_reward_zero_std": 0.0, "grad_norm": 4.435307025909424, "kl": 0.0006572658239747398, "learning_rate": 3.2e-07, "loss": -0.2559, "num_tokens": 21528.0, "reward": 5.125, "reward_std": 2.3228933811187744, "rewards/reward_combined/mean": 5.125, "rewards/reward_combined/std": 2.3228933811187744, "step": 65 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.0, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 24.0, "completions/mean_terminated_length": 24.0, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 1.2222222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 9.415471868123859e-05, "kl": 2.4978072303838417e-06, "learning_rate": 3.25e-07, "loss": 0.0, "num_tokens": 21892.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_combined/mean": 1.0, "rewards/reward_combined/std": 0.0, "step": 66 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 39.25, "completions/clipped_ratio": 0.0, "completions/max_length": 57.0, "completions/max_terminated_length": 57.0, "completions/mean_length": 39.25, "completions/mean_terminated_length": 39.25, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 1.2407407407407407, "frac_reward_zero_std": 0.0, "grad_norm": 3.40973162651062, "kl": 0.0005845139239681885, "learning_rate": 3.3e-07, "loss": -0.0099, "num_tokens": 22269.0, "reward": 4.625, "reward_std": 3.4731109142303467, "rewards/reward_combined/mean": 4.625, "rewards/reward_combined/std": 3.4731109142303467, "step": 67 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 124.5, "completions/clipped_ratio": 0.25, "completions/max_length": 256.0, "completions/max_terminated_length": 203.0, "completions/mean_length": 124.5, "completions/mean_terminated_length": 80.66667175292969, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 1.2592592592592593, "frac_reward_zero_std": 0.0, "grad_norm": 1.7258747816085815, "kl": 0.0003330958788865246, "learning_rate": 3.35e-07, "loss": 0.6342, "num_tokens": 22995.0, "reward": 0.8500000238418579, "reward_std": 3.4684290885925293, "rewards/reward_combined/mean": 0.8500000238418579, "rewards/reward_combined/std": 3.4684293270111084, "step": 68 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 1.2777777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.01908649317920208, "kl": 0.0003003895326401107, "learning_rate": 3.4000000000000003e-07, "loss": 0.0, "num_tokens": 23259.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 69 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 1.2962962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.008084496483206749, "kl": 0.00015302672909456305, "learning_rate": 3.4500000000000003e-07, "loss": 0.0, "num_tokens": 23523.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 70 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.25, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 14.25, "completions/mean_terminated_length": 14.25, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 1.3148148148148149, "frac_reward_zero_std": 0.0, "grad_norm": 4.539731979370117, "kl": 0.0001161574873549398, "learning_rate": 3.5000000000000004e-07, "loss": 0.0048, "num_tokens": 23804.0, "reward": 4.5, "reward_std": 4.041451930999756, "rewards/reward_combined/mean": 4.5, "rewards/reward_combined/std": 4.041451930999756, "step": 71 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 1.3333333333333333, "frac_reward_zero_std": 1.0, "grad_norm": 0.04141595587134361, "kl": 0.00022992491722106934, "learning_rate": 3.55e-07, "loss": 0.0, "num_tokens": 24012.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 72 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 1.3518518518518519, "frac_reward_zero_std": 1.0, "grad_norm": 0.013607971370220184, "kl": 0.0002536437605158426, "learning_rate": 3.6e-07, "loss": 0.0, "num_tokens": 24278.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 73 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 1.3703703703703702, "frac_reward_zero_std": 1.0, "grad_norm": 0.016140902414917946, "kl": 0.00028977543843211606, "learning_rate": 3.65e-07, "loss": 0.0, "num_tokens": 24534.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 74 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 1.3888888888888888, "frac_reward_zero_std": 0.0, "grad_norm": 4.098587512969971, "kl": 0.00014328223369375337, "learning_rate": 3.7e-07, "loss": 0.0031, "num_tokens": 24814.0, "reward": 7.875, "reward_std": 0.25, "rewards/reward_combined/mean": 7.875, "rewards/reward_combined/std": 0.25, "step": 75 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 86.5, "completions/clipped_ratio": 0.25, "completions/max_length": 256.0, "completions/max_terminated_length": 42.0, "completions/mean_length": 86.5, "completions/mean_terminated_length": 30.0, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 1.4074074074074074, "frac_reward_zero_std": 0.0, "grad_norm": 2.5013365745544434, "kl": 0.0005007164436392486, "learning_rate": 3.75e-07, "loss": 0.4328, "num_tokens": 25384.0, "reward": 2.049999952316284, "reward_std": 4.730397701263428, "rewards/reward_combined/mean": 2.049999952316284, "rewards/reward_combined/std": 4.730398178100586, "step": 76 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 41.5, "completions/clipped_ratio": 0.0, "completions/max_length": 50.0, "completions/max_terminated_length": 50.0, "completions/mean_length": 41.5, "completions/mean_terminated_length": 41.5, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 1.425925925925926, "frac_reward_zero_std": 0.0, "grad_norm": 1.9968727827072144, "kl": 0.00015144650024012662, "learning_rate": 3.8e-07, "loss": 0.0802, "num_tokens": 25830.0, "reward": 0.2250000238418579, "reward_std": 1.6459546089172363, "rewards/reward_combined/mean": 0.2250000238418579, "rewards/reward_combined/std": 1.6459547281265259, "step": 77 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.25, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 15.25, "completions/mean_terminated_length": 15.25, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 1.4444444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.004067050293087959, "kl": 0.0001710270080366172, "learning_rate": 3.85e-07, "loss": 0.0, "num_tokens": 26151.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 78 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.75, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 16.75, "completions/mean_terminated_length": 16.75, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 1.462962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.004056154750287533, "kl": 7.400437334581511e-05, "learning_rate": 3.9e-07, "loss": 0.0, "num_tokens": 26458.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_combined/mean": 1.0, "rewards/reward_combined/std": 0.0, "step": 79 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 1.4814814814814814, "frac_reward_zero_std": 1.0, "grad_norm": 0.0005788974813185632, "kl": 1.3294815289555117e-05, "learning_rate": 3.9500000000000003e-07, "loss": 0.0, "num_tokens": 26718.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 80 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.75, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.75, "completions/mean_terminated_length": 6.75, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 1.5, "frac_reward_zero_std": 1.0, "grad_norm": 0.002381294732913375, "kl": 3.735366317414446e-05, "learning_rate": 4.0000000000000003e-07, "loss": 0.0, "num_tokens": 26953.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 81 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 36.5, "completions/clipped_ratio": 0.0, "completions/max_length": 53.0, "completions/max_terminated_length": 53.0, "completions/mean_length": 36.5, "completions/mean_terminated_length": 36.5, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 1.5185185185185186, "frac_reward_zero_std": 0.0, "grad_norm": 4.475809097290039, "kl": 0.0005155188118806109, "learning_rate": 4.0500000000000004e-07, "loss": -0.0334, "num_tokens": 27319.0, "reward": 1.0, "reward_std": 1.632993221282959, "rewards/reward_combined/mean": 1.0, "rewards/reward_combined/std": 1.632993221282959, "step": 82 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 1.5370370370370372, "frac_reward_zero_std": 1.0, "grad_norm": 0.0006940840394236147, "kl": 1.2978911399841309e-05, "learning_rate": 4.1000000000000004e-07, "loss": 0.0, "num_tokens": 27531.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 83 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 91.0, "completions/clipped_ratio": 0.25, "completions/max_length": 256.0, "completions/max_terminated_length": 38.0, "completions/mean_length": 91.0, "completions/mean_terminated_length": 36.0, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 1.5555555555555556, "frac_reward_zero_std": 0.0, "grad_norm": 1.3287602663040161, "kl": 0.0002419799056951888, "learning_rate": 4.1500000000000005e-07, "loss": 0.4604, "num_tokens": 28119.0, "reward": 1.6749999523162842, "reward_std": 4.925021171569824, "rewards/reward_combined/mean": 1.6749999523162842, "rewards/reward_combined/std": 4.925021171569824, "step": 84 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.5, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 17.5, "completions/mean_terminated_length": 17.5, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 1.574074074074074, "frac_reward_zero_std": 0.0, "grad_norm": 2.2970223426818848, "kl": 0.00012435020835255273, "learning_rate": 4.2000000000000006e-07, "loss": 0.073, "num_tokens": 28437.0, "reward": 2.0, "reward_std": 1.7320507764816284, "rewards/reward_combined/mean": 2.0, "rewards/reward_combined/std": 1.7320507764816284, "step": 85 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.75, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 13.75, "completions/mean_terminated_length": 13.75, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 1.5925925925925926, "frac_reward_zero_std": 0.0, "grad_norm": 3.6602261066436768, "kl": 0.00014077619243835215, "learning_rate": 4.2500000000000006e-07, "loss": 0.0885, "num_tokens": 28716.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 86 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.75, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.75, "completions/mean_terminated_length": 14.75, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 1.6111111111111112, "frac_reward_zero_std": 0.0, "grad_norm": 5.254466533660889, "kl": 0.0008489463216392323, "learning_rate": 4.3e-07, "loss": -0.0213, "num_tokens": 29007.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 87 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.75, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 13.75, "completions/mean_terminated_length": 13.75, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 1.6296296296296298, "frac_reward_zero_std": 0.0, "grad_norm": 5.711783409118652, "kl": 0.0005886612052563578, "learning_rate": 4.35e-07, "loss": -0.2294, "num_tokens": 29262.0, "reward": 0.875, "reward_std": 2.462214469909668, "rewards/reward_combined/mean": 0.875, "rewards/reward_combined/std": 2.462214469909668, "step": 88 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.75, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 17.75, "completions/mean_terminated_length": 17.75, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 1.6481481481481481, "frac_reward_zero_std": 1.0, "grad_norm": 0.0002970896603073925, "kl": 2.9337934392970055e-06, "learning_rate": 4.4e-07, "loss": 0.0, "num_tokens": 29561.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 89 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.75, "completions/clipped_ratio": 0.0, "completions/max_length": 28.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 22.75, "completions/mean_terminated_length": 22.75, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 1.6666666666666665, "frac_reward_zero_std": 0.0, "grad_norm": 4.472600936889648, "kl": 0.0006516308058053255, "learning_rate": 4.4500000000000003e-07, "loss": -0.008, "num_tokens": 29872.0, "reward": 1.75, "reward_std": 1.5, "rewards/reward_combined/mean": 1.75, "rewards/reward_combined/std": 1.5, "step": 90 }, { "clip_ratio/high_max": 0.0069444444961845875, "clip_ratio/high_mean": 0.0069444444961845875, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0069444444961845875, "completion_length": 90.0, "completions/clipped_ratio": 0.25, "completions/max_length": 256.0, "completions/max_terminated_length": 39.0, "completions/mean_length": 90.0, "completions/mean_terminated_length": 34.66666793823242, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 1.6851851851851851, "frac_reward_zero_std": 0.0, "grad_norm": 1.860525369644165, "kl": 0.00041001294448506087, "learning_rate": 4.5000000000000003e-07, "loss": 0.3903, "num_tokens": 30448.0, "reward": 2.0, "reward_std": 3.0, "rewards/reward_combined/mean": 2.0, "rewards/reward_combined/std": 3.0, "step": 91 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 34.5, "completions/clipped_ratio": 0.0, "completions/max_length": 39.0, "completions/max_terminated_length": 39.0, "completions/mean_length": 34.5, "completions/mean_terminated_length": 34.5, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 1.7037037037037037, "frac_reward_zero_std": 0.0, "grad_norm": 1.6269596815109253, "kl": 0.00038669158675475046, "learning_rate": 4.5500000000000004e-07, "loss": 0.0183, "num_tokens": 30822.0, "reward": 2.625, "reward_std": 1.108677864074707, "rewards/reward_combined/mean": 2.625, "rewards/reward_combined/std": 1.1086779832839966, "step": 92 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.75, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 22.75, "completions/mean_terminated_length": 22.75, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 1.7222222222222223, "frac_reward_zero_std": 0.0, "grad_norm": 2.4100682735443115, "kl": 0.00041439110646024346, "learning_rate": 4.6000000000000004e-07, "loss": -0.0127, "num_tokens": 31157.0, "reward": 4.5, "reward_std": 4.725815773010254, "rewards/reward_combined/mean": 4.5, "rewards/reward_combined/std": 4.725815773010254, "step": 93 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 31.25, "completions/clipped_ratio": 0.0, "completions/max_length": 45.0, "completions/max_terminated_length": 45.0, "completions/mean_length": 31.25, "completions/mean_terminated_length": 31.25, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 1.7407407407407407, "frac_reward_zero_std": 0.0, "grad_norm": 2.267754554748535, "kl": 0.000419690681155771, "learning_rate": 4.6500000000000005e-07, "loss": -0.0801, "num_tokens": 31534.0, "reward": 1.125, "reward_std": 1.25, "rewards/reward_combined/mean": 1.125, "rewards/reward_combined/std": 1.25, "step": 94 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 47.25, "completions/clipped_ratio": 0.0, "completions/max_length": 69.0, "completions/max_terminated_length": 69.0, "completions/mean_length": 47.25, "completions/mean_terminated_length": 47.25, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 1.7592592592592593, "frac_reward_zero_std": 0.0, "grad_norm": 3.107111930847168, "kl": 0.0005589666543528438, "learning_rate": 4.7000000000000005e-07, "loss": -0.033, "num_tokens": 31951.0, "reward": 1.625, "reward_std": 4.479118347167969, "rewards/reward_combined/mean": 1.625, "rewards/reward_combined/std": 4.479118347167969, "step": 95 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.75, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 4.75, "completions/mean_terminated_length": 4.75, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 1.7777777777777777, "frac_reward_zero_std": 0.0, "grad_norm": 10.471097946166992, "kl": 0.0002822727024067717, "learning_rate": 4.7500000000000006e-07, "loss": 0.0053, "num_tokens": 32170.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 96 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 1.7962962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 1.712054836389143e-05, "kl": 2.5704503059387207e-07, "learning_rate": 4.800000000000001e-07, "loss": 0.0, "num_tokens": 32390.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 97 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.25, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 9.25, "completions/mean_terminated_length": 9.25, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 1.8148148148148149, "frac_reward_zero_std": 1.0, "grad_norm": 0.010497132316231728, "kl": 9.453139136894606e-05, "learning_rate": 4.85e-07, "loss": 0.0, "num_tokens": 32639.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 98 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 1.8333333333333335, "frac_reward_zero_std": 0.0, "grad_norm": 3.6569571495056152, "kl": 0.00014023463882040232, "learning_rate": 4.900000000000001e-07, "loss": -0.0355, "num_tokens": 32927.0, "reward": 4.5, "reward_std": 4.041451930999756, "rewards/reward_combined/mean": 4.5, "rewards/reward_combined/std": 4.041451930999756, "step": 99 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 35.5, "completions/clipped_ratio": 0.0, "completions/max_length": 50.0, "completions/max_terminated_length": 50.0, "completions/mean_length": 35.5, "completions/mean_terminated_length": 35.5, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 1.8518518518518519, "frac_reward_zero_std": 0.0, "grad_norm": 2.6258456707000732, "kl": 0.0005591202643699944, "learning_rate": 4.95e-07, "loss": -0.0597, "num_tokens": 33297.0, "reward": 1.25, "reward_std": 3.0686588287353516, "rewards/reward_combined/mean": 1.25, "rewards/reward_combined/std": 3.0686588287353516, "step": 100 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 74.5, "completions/clipped_ratio": 0.25, "completions/max_length": 256.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 74.5, "completions/mean_terminated_length": 14.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 1.8703703703703702, "frac_reward_zero_std": 0.0, "grad_norm": 2.39278507232666, "kl": 0.00016994951874949038, "learning_rate": 5.000000000000001e-07, "loss": 0.5082, "num_tokens": 33827.0, "reward": 4.300000190734863, "reward_std": 4.284857273101807, "rewards/reward_combined/mean": 4.300000190734863, "rewards/reward_combined/std": 4.284857273101807, "step": 101 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 1.8888888888888888, "frac_reward_zero_std": 1.0, "grad_norm": 0.0018180354963988066, "kl": 2.8094106255593942e-05, "learning_rate": 5.05e-07, "loss": 0.0, "num_tokens": 34127.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 102 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 36.0, "completions/clipped_ratio": 0.0, "completions/max_length": 86.0, "completions/max_terminated_length": 86.0, "completions/mean_length": 36.0, "completions/mean_terminated_length": 36.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 1.9074074074074074, "frac_reward_zero_std": 0.0, "grad_norm": 2.7861289978027344, "kl": 0.0001018730672512902, "learning_rate": 5.1e-07, "loss": 0.1095, "num_tokens": 34511.0, "reward": 3.5, "reward_std": 5.446711540222168, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 5.446711540222168, "step": 103 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 1.925925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.0006408070912584662, "kl": 5.133450031280518e-06, "learning_rate": 5.15e-07, "loss": 0.0, "num_tokens": 34747.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 104 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.75, "completions/clipped_ratio": 0.0, "completions/max_length": 30.0, "completions/max_terminated_length": 30.0, "completions/mean_length": 18.75, "completions/mean_terminated_length": 18.75, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 1.9444444444444444, "frac_reward_zero_std": 0.0, "grad_norm": 6.733788013458252, "kl": 0.0008390087750740349, "learning_rate": 5.2e-07, "loss": 0.2134, "num_tokens": 35042.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 105 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.017241379246115685, "clip_ratio/low_min": 0.017241379246115685, "clip_ratio/region_mean": 0.017241379246115685, "completion_length": 12.75, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 12.75, "completions/mean_terminated_length": 12.75, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 1.9629629629629628, "frac_reward_zero_std": 0.0, "grad_norm": 4.298315048217773, "kl": 0.0003872589295497164, "learning_rate": 5.250000000000001e-07, "loss": 0.1387, "num_tokens": 35321.0, "reward": 4.5, "reward_std": 4.041451930999756, "rewards/reward_combined/mean": 4.5, "rewards/reward_combined/std": 4.041451930999756, "step": 106 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.25, "completions/clipped_ratio": 0.0, "completions/max_length": 29.0, "completions/max_terminated_length": 29.0, "completions/mean_length": 24.25, "completions/mean_terminated_length": 24.25, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 1.9814814814814814, "frac_reward_zero_std": 0.0, "grad_norm": 2.3659603595733643, "kl": 0.0005509029579116032, "learning_rate": 5.3e-07, "loss": 0.0349, "num_tokens": 35638.0, "reward": 2.125, "reward_std": 4.2303466796875, "rewards/reward_combined/mean": 2.125, "rewards/reward_combined/std": 4.2303466796875, "step": 107 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 29.25, "completions/clipped_ratio": 0.0, "completions/max_length": 49.0, "completions/max_terminated_length": 49.0, "completions/mean_length": 29.25, "completions/mean_terminated_length": 29.25, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 2.0, "frac_reward_zero_std": 0.0, "grad_norm": 5.522470474243164, "kl": 0.001075445325113833, "learning_rate": 5.350000000000001e-07, "loss": -0.1648, "num_tokens": 36003.0, "reward": 1.75, "reward_std": 1.5, "rewards/reward_combined/mean": 1.75, "rewards/reward_combined/std": 1.5, "step": 108 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.0, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 19.0, "completions/mean_terminated_length": 19.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 2.0185185185185186, "frac_reward_zero_std": 0.0, "grad_norm": 4.32027530670166, "kl": 0.0003525819738570135, "learning_rate": 5.4e-07, "loss": 0.0639, "num_tokens": 36299.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 109 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.011111111380159855, "clip_ratio/low_min": 0.011111111380159855, "clip_ratio/region_mean": 0.011111111380159855, "completion_length": 20.75, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 20.75, "completions/mean_terminated_length": 20.75, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 2.037037037037037, "frac_reward_zero_std": 0.0, "grad_norm": 6.516188621520996, "kl": 0.0016761224251240492, "learning_rate": 5.450000000000001e-07, "loss": 0.054, "num_tokens": 36622.0, "reward": 2.299999952316284, "reward_std": 3.802630662918091, "rewards/reward_combined/mean": 2.299999952316284, "rewards/reward_combined/std": 3.8026304244995117, "step": 110 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 36.75, "completions/clipped_ratio": 0.0, "completions/max_length": 44.0, "completions/max_terminated_length": 44.0, "completions/mean_length": 36.75, "completions/mean_terminated_length": 36.75, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 2.0555555555555554, "frac_reward_zero_std": 0.0, "grad_norm": 1.7964736223220825, "kl": 0.00013105083053233102, "learning_rate": 5.5e-07, "loss": 0.0186, "num_tokens": 37049.0, "reward": 0.800000011920929, "reward_std": 1.5033296346664429, "rewards/reward_combined/mean": 0.800000011920929, "rewards/reward_combined/std": 1.5033297538757324, "step": 111 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 30.75, "completions/clipped_ratio": 0.0, "completions/max_length": 53.0, "completions/max_terminated_length": 53.0, "completions/mean_length": 30.75, "completions/mean_terminated_length": 30.75, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 2.074074074074074, "frac_reward_zero_std": 0.0, "grad_norm": 4.186060428619385, "kl": 0.0004813349078176543, "learning_rate": 5.550000000000001e-07, "loss": 0.1532, "num_tokens": 37400.0, "reward": 2.875, "reward_std": 3.8810436725616455, "rewards/reward_combined/mean": 2.875, "rewards/reward_combined/std": 3.8810436725616455, "step": 112 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 5.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 5.0, "completions/mean_terminated_length": 5.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 2.0925925925925926, "frac_reward_zero_std": 0.0, "grad_norm": 6.489348888397217, "kl": 0.0006203134835232049, "learning_rate": 5.6e-07, "loss": -0.0507, "num_tokens": 37616.0, "reward": 2.75, "reward_std": 1.5, "rewards/reward_combined/mean": 2.75, "rewards/reward_combined/std": 1.5, "step": 113 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 100.75, "completions/clipped_ratio": 0.25, "completions/max_length": 256.0, "completions/max_terminated_length": 69.0, "completions/mean_length": 100.75, "completions/mean_terminated_length": 49.0, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 2.111111111111111, "frac_reward_zero_std": 0.0, "grad_norm": 2.4407665729522705, "kl": 0.0005087032477604225, "learning_rate": 5.650000000000001e-07, "loss": 0.2001, "num_tokens": 38235.0, "reward": 2.424999952316284, "reward_std": 4.530176639556885, "rewards/reward_combined/mean": 2.424999952316284, "rewards/reward_combined/std": 4.530176639556885, "step": 114 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 2.1296296296296298, "frac_reward_zero_std": 0.0, "grad_norm": 5.9469733238220215, "kl": 0.0004716240619018208, "learning_rate": 5.7e-07, "loss": 0.0817, "num_tokens": 38509.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 115 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.0, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 21.0, "completions/mean_terminated_length": 21.0, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 2.148148148148148, "frac_reward_zero_std": 0.0, "grad_norm": 2.382570743560791, "kl": 0.00018100748275173828, "learning_rate": 5.750000000000001e-07, "loss": -0.0174, "num_tokens": 38841.0, "reward": 4.0, "reward_std": 4.041451930999756, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 4.041451930999756, "step": 116 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 100.5, "completions/clipped_ratio": 0.25, "completions/max_length": 256.0, "completions/max_terminated_length": 69.0, "completions/mean_length": 100.5, "completions/mean_terminated_length": 48.66666793823242, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 2.1666666666666665, "frac_reward_zero_std": 0.0, "grad_norm": 2.1383755207061768, "kl": 0.0005466157454065979, "learning_rate": 5.800000000000001e-07, "loss": -0.3445, "num_tokens": 39495.0, "reward": -0.19999998807907104, "reward_std": 1.5684387683868408, "rewards/reward_combined/mean": -0.19999998807907104, "rewards/reward_combined/std": 1.5684387683868408, "step": 117 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.5, "completions/clipped_ratio": 0.0, "completions/max_length": 45.0, "completions/max_terminated_length": 45.0, "completions/mean_length": 32.5, "completions/mean_terminated_length": 32.5, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 2.185185185185185, "frac_reward_zero_std": 0.0, "grad_norm": 2.38777232170105, "kl": 0.00025938585167750716, "learning_rate": 5.850000000000001e-07, "loss": -0.0088, "num_tokens": 39849.0, "reward": 2.875, "reward_std": 5.375484466552734, "rewards/reward_combined/mean": 2.875, "rewards/reward_combined/std": 5.375484466552734, "step": 118 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.25, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.25, "completions/mean_terminated_length": 14.25, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 2.2037037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.001290779560804367, "kl": 5.739182233810425e-05, "learning_rate": 5.900000000000001e-07, "loss": 0.0, "num_tokens": 40138.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 119 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.5, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 8.5, "completions/mean_terminated_length": 8.5, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 2.2222222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.014208944514393806, "kl": 0.00026482938574190484, "learning_rate": 5.95e-07, "loss": 0.0, "num_tokens": 40392.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 120 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 42.75, "completions/clipped_ratio": 0.0, "completions/max_length": 90.0, "completions/max_terminated_length": 90.0, "completions/mean_length": 42.75, "completions/mean_terminated_length": 42.75, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 2.240740740740741, "frac_reward_zero_std": 0.0, "grad_norm": 2.1715030670166016, "kl": 0.0004549708100967109, "learning_rate": 6.000000000000001e-07, "loss": 0.2571, "num_tokens": 40783.0, "reward": 2.625, "reward_std": 3.25, "rewards/reward_combined/mean": 2.625, "rewards/reward_combined/std": 3.25, "step": 121 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 2.259259259259259, "frac_reward_zero_std": 1.0, "grad_norm": 0.013272075913846493, "kl": 0.00013608485460281372, "learning_rate": 6.05e-07, "loss": 0.0, "num_tokens": 40995.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 122 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 2.2777777777777777, "frac_reward_zero_std": 0.0, "grad_norm": 11.347054481506348, "kl": 0.0009227428381564096, "learning_rate": 6.100000000000001e-07, "loss": 0.0367, "num_tokens": 41259.0, "reward": 2.875, "reward_std": 1.9311050176620483, "rewards/reward_combined/mean": 2.875, "rewards/reward_combined/std": 1.9311050176620483, "step": 123 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 143.5, "completions/clipped_ratio": 0.5, "completions/max_length": 256.0, "completions/max_terminated_length": 35.0, "completions/mean_length": 143.5, "completions/mean_terminated_length": 31.0, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 2.2962962962962963, "frac_reward_zero_std": 0.0, "grad_norm": 1.8570256233215332, "kl": 0.0002119517303071916, "learning_rate": 6.15e-07, "loss": -0.0045, "num_tokens": 42069.0, "reward": -1.4000000953674316, "reward_std": 2.5651512145996094, "rewards/reward_combined/mean": -1.4000000953674316, "rewards/reward_combined/std": 2.5651512145996094, "step": 124 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 2.314814814814815, "frac_reward_zero_std": 0.0, "grad_norm": 2.1780805587768555, "kl": 0.00013237333405413665, "learning_rate": 6.200000000000001e-07, "loss": -0.0765, "num_tokens": 42353.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 125 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 2.3333333333333335, "frac_reward_zero_std": 1.0, "grad_norm": 0.005459534004330635, "kl": 9.964062701328658e-05, "learning_rate": 6.25e-07, "loss": 0.0, "num_tokens": 42637.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 126 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.5, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 16.5, "completions/mean_terminated_length": 16.5, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 2.351851851851852, "frac_reward_zero_std": 0.0, "grad_norm": 2.3050265312194824, "kl": 6.153931099106558e-05, "learning_rate": 6.3e-07, "loss": 0.0268, "num_tokens": 42951.0, "reward": 2.75, "reward_std": 1.5, "rewards/reward_combined/mean": 2.75, "rewards/reward_combined/std": 1.5, "step": 127 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 85.75, "completions/clipped_ratio": 0.25, "completions/max_length": 256.0, "completions/max_terminated_length": 48.0, "completions/mean_length": 85.75, "completions/mean_terminated_length": 29.0, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 2.3703703703703702, "frac_reward_zero_std": 0.0, "grad_norm": 2.593751907348633, "kl": 0.00013137029691279167, "learning_rate": 6.350000000000001e-07, "loss": 0.0259, "num_tokens": 43514.0, "reward": 2.049999952316284, "reward_std": 3.652852773666382, "rewards/reward_combined/mean": 2.049999952316284, "rewards/reward_combined/std": 3.652852773666382, "step": 128 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 2.388888888888889, "frac_reward_zero_std": 0.0, "grad_norm": 10.23281478881836, "kl": 0.0008239075541496277, "learning_rate": 6.4e-07, "loss": 0.0092, "num_tokens": 43758.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 129 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 29.0, "completions/clipped_ratio": 0.0, "completions/max_length": 37.0, "completions/max_terminated_length": 37.0, "completions/mean_length": 29.0, "completions/mean_terminated_length": 29.0, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 2.4074074074074074, "frac_reward_zero_std": 0.0, "grad_norm": 4.093120098114014, "kl": 0.0007498844061046839, "learning_rate": 6.450000000000001e-07, "loss": 0.0462, "num_tokens": 44102.0, "reward": 1.25, "reward_std": 1.1902379989624023, "rewards/reward_combined/mean": 1.25, "rewards/reward_combined/std": 1.190238118171692, "step": 130 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 2.425925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.010830286890268326, "kl": 0.00012982413500139955, "learning_rate": 6.5e-07, "loss": 0.0, "num_tokens": 44372.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 131 }, { "clip_ratio/high_max": 0.017241379246115685, "clip_ratio/high_mean": 0.017241379246115685, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.017241379246115685, "completion_length": 13.25, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 13.25, "completions/mean_terminated_length": 13.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 2.4444444444444446, "frac_reward_zero_std": 0.0, "grad_norm": 4.542089462280273, "kl": 0.00022038919269107282, "learning_rate": 6.550000000000001e-07, "loss": 0.0533, "num_tokens": 44637.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 132 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 2.462962962962963, "frac_reward_zero_std": 0.0, "grad_norm": 2.711742877960205, "kl": 8.712472117622383e-05, "learning_rate": 6.6e-07, "loss": 0.0333, "num_tokens": 44927.0, "reward": 2.25, "reward_std": 3.5, "rewards/reward_combined/mean": 2.25, "rewards/reward_combined/std": 3.5, "step": 133 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 77.5, "completions/clipped_ratio": 0.25, "completions/max_length": 256.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 77.5, "completions/mean_terminated_length": 18.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 2.4814814814814814, "frac_reward_zero_std": 0.0, "grad_norm": 2.2106378078460693, "kl": 0.00033674850419629365, "learning_rate": 6.650000000000001e-07, "loss": 0.4628, "num_tokens": 45461.0, "reward": 3.674999952316284, "reward_std": 5.328148365020752, "rewards/reward_combined/mean": 3.674999952316284, "rewards/reward_combined/std": 5.328148365020752, "step": 134 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 2.5, "frac_reward_zero_std": 1.0, "grad_norm": 0.006199777591973543, "kl": 8.84437125137083e-05, "learning_rate": 6.7e-07, "loss": 0.0, "num_tokens": 45737.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 135 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 74.75, "completions/clipped_ratio": 0.25, "completions/max_length": 256.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 74.75, "completions/mean_terminated_length": 14.333333969116211, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 2.5185185185185186, "frac_reward_zero_std": 0.0, "grad_norm": 2.7455718517303467, "kl": 0.0006337195663945749, "learning_rate": 6.750000000000001e-07, "loss": 0.4656, "num_tokens": 46256.0, "reward": 6.300000190734863, "reward_std": 2.3999998569488525, "rewards/reward_combined/mean": 6.300000190734863, "rewards/reward_combined/std": 2.3999998569488525, "step": 136 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.25, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 9.25, "completions/mean_terminated_length": 9.25, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 2.537037037037037, "frac_reward_zero_std": 0.0, "grad_norm": 7.123297691345215, "kl": 0.0015466511249542236, "learning_rate": 6.800000000000001e-07, "loss": -0.0219, "num_tokens": 46517.0, "reward": 4.0, "reward_std": 4.041451930999756, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 4.041451930999756, "step": 137 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.75, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 6.75, "completions/mean_terminated_length": 6.75, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 2.5555555555555554, "frac_reward_zero_std": 0.0, "grad_norm": 11.34954833984375, "kl": 0.00038402079371735454, "learning_rate": 6.850000000000001e-07, "loss": 0.2908, "num_tokens": 46764.0, "reward": 2.5, "reward_std": 3.0, "rewards/reward_combined/mean": 2.5, "rewards/reward_combined/std": 3.0, "step": 138 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 2.574074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.01095586083829403, "kl": 0.00015586709923809394, "learning_rate": 6.900000000000001e-07, "loss": 0.0, "num_tokens": 47032.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 139 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 2.5925925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.00017439624934922904, "kl": 1.2889504432678223e-05, "learning_rate": 6.950000000000001e-07, "loss": 0.0, "num_tokens": 47244.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 140 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 2.611111111111111, "frac_reward_zero_std": 0.0, "grad_norm": 3.713172674179077, "kl": 0.0003980778274126351, "learning_rate": 7.000000000000001e-07, "loss": -0.0583, "num_tokens": 47532.0, "reward": 4.5, "reward_std": 4.041451930999756, "rewards/reward_combined/mean": 4.5, "rewards/reward_combined/std": 4.041451930999756, "step": 141 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.75, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 13.75, "completions/mean_terminated_length": 13.75, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 2.6296296296296298, "frac_reward_zero_std": 0.0, "grad_norm": 3.467461585998535, "kl": 0.0004428433512657648, "learning_rate": 7.05e-07, "loss": -0.0353, "num_tokens": 47811.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 142 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.0, "completions/clipped_ratio": 0.0, "completions/max_length": 27.0, "completions/max_terminated_length": 27.0, "completions/mean_length": 20.0, "completions/mean_terminated_length": 20.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 2.648148148148148, "frac_reward_zero_std": 0.0, "grad_norm": 6.4800190925598145, "kl": 0.0005975357198622078, "learning_rate": 7.1e-07, "loss": 0.0018, "num_tokens": 48139.0, "reward": 2.5, "reward_std": 1.7320507764816284, "rewards/reward_combined/mean": 2.5, "rewards/reward_combined/std": 1.7320507764816284, "step": 143 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 10.5, "completions/mean_terminated_length": 10.5, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 2.6666666666666665, "frac_reward_zero_std": 1.0, "grad_norm": 0.013862133957445621, "kl": 0.00023708815569989383, "learning_rate": 7.15e-07, "loss": 0.0, "num_tokens": 48401.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 144 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.75, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 21.75, "completions/mean_terminated_length": 21.75, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 2.685185185185185, "frac_reward_zero_std": 0.0, "grad_norm": 1.7972193956375122, "kl": 0.0010266218159813434, "learning_rate": 7.2e-07, "loss": -0.0007, "num_tokens": 48732.0, "reward": 4.375, "reward_std": 4.75, "rewards/reward_combined/mean": 4.375, "rewards/reward_combined/std": 4.75, "step": 145 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 2.7037037037037037, "frac_reward_zero_std": 0.0, "grad_norm": 3.4336752891540527, "kl": 9.669694009062368e-05, "learning_rate": 7.25e-07, "loss": 0.061, "num_tokens": 49032.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 146 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 2.7222222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.014237516559660435, "kl": 0.0001291364405915374, "learning_rate": 7.3e-07, "loss": 0.0, "num_tokens": 49288.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 147 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.0, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 24.0, "completions/mean_terminated_length": 24.0, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 2.7407407407407405, "frac_reward_zero_std": 1.0, "grad_norm": 0.00010676102101569995, "kl": 3.3639371395111084e-06, "learning_rate": 7.350000000000001e-07, "loss": 0.0, "num_tokens": 49652.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_combined/mean": 1.0, "rewards/reward_combined/std": 0.0, "step": 148 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.25, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 19.25, "completions/mean_terminated_length": 19.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 2.7592592592592595, "frac_reward_zero_std": 0.0, "grad_norm": 2.2932288646698, "kl": 0.00010290895806974731, "learning_rate": 7.4e-07, "loss": 0.1893, "num_tokens": 49953.0, "reward": 4.0, "reward_std": 2.915475845336914, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 2.915475845336914, "step": 149 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 79.75, "completions/clipped_ratio": 0.25, "completions/max_length": 256.0, "completions/max_terminated_length": 25.0, "completions/mean_length": 79.75, "completions/mean_terminated_length": 21.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 2.7777777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.027895966544747353, "kl": 0.0005462750559672713, "learning_rate": 7.450000000000001e-07, "loss": 0.0, "num_tokens": 50492.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_combined/mean": 1.0, "rewards/reward_combined/std": 0.0, "step": 150 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.0, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 17.0, "completions/mean_terminated_length": 17.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 2.7962962962962963, "frac_reward_zero_std": 0.0, "grad_norm": 4.1199541091918945, "kl": 0.0008081686173682101, "learning_rate": 7.5e-07, "loss": 0.1099, "num_tokens": 50788.0, "reward": 7.0, "reward_std": 2.0, "rewards/reward_combined/mean": 7.0, "rewards/reward_combined/std": 2.0, "step": 151 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.008474576286971569, "clip_ratio/low_min": 0.008474576286971569, "clip_ratio/region_mean": 0.008474576286971569, "completion_length": 27.75, "completions/clipped_ratio": 0.0, "completions/max_length": 33.0, "completions/max_terminated_length": 33.0, "completions/mean_length": 27.75, "completions/mean_terminated_length": 27.75, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 2.814814814814815, "frac_reward_zero_std": 0.0, "grad_norm": 2.599039316177368, "kl": 0.0014945815491955727, "learning_rate": 7.550000000000001e-07, "loss": 0.0254, "num_tokens": 51127.0, "reward": 3.375, "reward_std": 3.3008837699890137, "rewards/reward_combined/mean": 3.375, "rewards/reward_combined/std": 3.3008837699890137, "step": 152 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.25, "completions/clipped_ratio": 0.0, "completions/max_length": 40.0, "completions/max_terminated_length": 40.0, "completions/mean_length": 25.25, "completions/mean_terminated_length": 25.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 2.8333333333333335, "frac_reward_zero_std": 0.0, "grad_norm": 4.152106285095215, "kl": 0.0003815448144450784, "learning_rate": 7.6e-07, "loss": 0.2177, "num_tokens": 51480.0, "reward": 4.5, "reward_std": 4.041451930999756, "rewards/reward_combined/mean": 4.5, "rewards/reward_combined/std": 4.041451930999756, "step": 153 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 2.851851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.002794083673506975, "kl": 2.3503194825025275e-05, "learning_rate": 7.650000000000001e-07, "loss": 0.0, "num_tokens": 51776.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 154 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.0, "completions/clipped_ratio": 0.0, "completions/max_length": 29.0, "completions/max_terminated_length": 29.0, "completions/mean_length": 20.0, "completions/mean_terminated_length": 20.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 2.8703703703703702, "frac_reward_zero_std": 1.0, "grad_norm": 0.007637332659214735, "kl": 8.353089606316644e-05, "learning_rate": 7.7e-07, "loss": 0.0, "num_tokens": 52096.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_combined/mean": 1.0, "rewards/reward_combined/std": 0.0, "step": 155 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 2.888888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 1.6680789485690184e-05, "kl": 2.0116567611694336e-07, "learning_rate": 7.750000000000001e-07, "loss": 0.0, "num_tokens": 52316.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 156 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 7.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 7.0, "completions/mean_terminated_length": 7.0, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 2.9074074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.0031553194858133793, "kl": 3.410237241041614e-05, "learning_rate": 7.8e-07, "loss": 0.0, "num_tokens": 52552.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 157 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.25, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 9.25, "completions/mean_terminated_length": 9.25, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 2.925925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.01248088013380766, "kl": 0.0002267432282678783, "learning_rate": 7.850000000000001e-07, "loss": 0.0, "num_tokens": 52817.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 158 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 5.5, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 5.5, "completions/mean_terminated_length": 5.5, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 2.9444444444444446, "frac_reward_zero_std": 0.0, "grad_norm": 6.774168014526367, "kl": 0.0009432996848772746, "learning_rate": 7.900000000000001e-07, "loss": 0.083, "num_tokens": 53039.0, "reward": 2.5, "reward_std": 3.0, "rewards/reward_combined/mean": 2.5, "rewards/reward_combined/std": 3.0, "step": 159 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.25, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 18.25, "completions/mean_terminated_length": 18.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 2.962962962962963, "frac_reward_zero_std": 0.0, "grad_norm": 3.76754093170166, "kl": 0.0003742112312465906, "learning_rate": 7.950000000000001e-07, "loss": 0.0115, "num_tokens": 53372.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 160 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 2.9814814814814814, "frac_reward_zero_std": 0.0, "grad_norm": 7.528498649597168, "kl": 0.0005340461502783, "learning_rate": 8.000000000000001e-07, "loss": 0.0026, "num_tokens": 53644.0, "reward": 6.125, "reward_std": 3.75, "rewards/reward_combined/mean": 6.125, "rewards/reward_combined/std": 3.75, "step": 161 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.5, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 18.5, "completions/mean_terminated_length": 18.5, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 3.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.013261830434203148, "kl": 0.00048185289779212326, "learning_rate": 8.050000000000001e-07, "loss": 0.0, "num_tokens": 53958.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 162 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.75, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 12.75, "completions/mean_terminated_length": 12.75, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 3.0185185185185186, "frac_reward_zero_std": 1.0, "grad_norm": 0.012099964544177055, "kl": 0.00014674800331704319, "learning_rate": 8.100000000000001e-07, "loss": 0.0, "num_tokens": 54237.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 163 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.75, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 19.75, "completions/mean_terminated_length": 19.75, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 3.037037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.03482325002551079, "kl": 0.0004571895260596648, "learning_rate": 8.150000000000001e-07, "loss": 0.0, "num_tokens": 54568.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 164 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.25, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 13.25, "completions/mean_terminated_length": 13.25, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 3.0555555555555554, "frac_reward_zero_std": 1.0, "grad_norm": 0.026677284389734268, "kl": 0.00038430988206528127, "learning_rate": 8.200000000000001e-07, "loss": 0.0, "num_tokens": 54841.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 165 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 3.074074074074074, "frac_reward_zero_std": 0.0, "grad_norm": 5.690991401672363, "kl": 0.00023384806263493374, "learning_rate": 8.250000000000001e-07, "loss": 0.0205, "num_tokens": 55113.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 166 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 83.25, "completions/clipped_ratio": 0.25, "completions/max_length": 256.0, "completions/max_terminated_length": 30.0, "completions/mean_length": 83.25, "completions/mean_terminated_length": 25.666667938232422, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 3.0925925925925926, "frac_reward_zero_std": 0.0, "grad_norm": 2.141160726547241, "kl": 0.0006422503502108157, "learning_rate": 8.300000000000001e-07, "loss": 0.4264, "num_tokens": 55694.0, "reward": 0.800000011920929, "reward_std": 0.4000000059604645, "rewards/reward_combined/mean": 0.800000011920929, "rewards/reward_combined/std": 0.4000000059604645, "step": 167 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.75, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 4.75, "completions/mean_terminated_length": 4.75, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 3.111111111111111, "frac_reward_zero_std": 0.0, "grad_norm": 10.635080337524414, "kl": 0.000193272324395366, "learning_rate": 8.350000000000002e-07, "loss": 0.1262, "num_tokens": 55917.0, "reward": 2.375, "reward_std": 1.25, "rewards/reward_combined/mean": 2.375, "rewards/reward_combined/std": 1.25, "step": 168 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 3.1296296296296298, "frac_reward_zero_std": 0.0, "grad_norm": 3.533308982849121, "kl": 0.0006996378433541395, "learning_rate": 8.400000000000001e-07, "loss": 0.0212, "num_tokens": 56199.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 169 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 3.148148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.0049111791886389256, "kl": 0.00010645241127349436, "learning_rate": 8.450000000000002e-07, "loss": 0.0, "num_tokens": 56459.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 170 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.75, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 4.75, "completions/mean_terminated_length": 4.75, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 3.1666666666666665, "frac_reward_zero_std": 0.0, "grad_norm": 12.71115779876709, "kl": 0.00027540007431525737, "learning_rate": 8.500000000000001e-07, "loss": 0.145, "num_tokens": 56698.0, "reward": 3.125, "reward_std": 1.75, "rewards/reward_combined/mean": 3.125, "rewards/reward_combined/std": 1.75, "step": 171 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.25, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 13.25, "completions/mean_terminated_length": 13.25, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 3.185185185185185, "frac_reward_zero_std": 0.0, "grad_norm": 5.089034080505371, "kl": 0.0002739095871220343, "learning_rate": 8.550000000000002e-07, "loss": -0.0199, "num_tokens": 56983.0, "reward": 6.75, "reward_std": 2.5, "rewards/reward_combined/mean": 6.75, "rewards/reward_combined/std": 2.5, "step": 172 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 75.25, "completions/clipped_ratio": 0.0, "completions/max_length": 185.0, "completions/max_terminated_length": 185.0, "completions/mean_length": 75.25, "completions/mean_terminated_length": 75.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 3.2037037037037037, "frac_reward_zero_std": 0.0, "grad_norm": 1.7757223844528198, "kl": 0.0007941615185700357, "learning_rate": 8.6e-07, "loss": 0.4436, "num_tokens": 57524.0, "reward": 2.549999952316284, "reward_std": 3.652852773666382, "rewards/reward_combined/mean": 2.549999952316284, "rewards/reward_combined/std": 3.6528525352478027, "step": 173 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 32.0, "completions/max_terminated_length": 32.0, "completions/mean_length": 18.0, "completions/mean_terminated_length": 18.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 3.2222222222222223, "frac_reward_zero_std": 0.0, "grad_norm": 2.3837592601776123, "kl": 0.0001371638645650819, "learning_rate": 8.65e-07, "loss": 0.1302, "num_tokens": 57828.0, "reward": 3.375, "reward_std": 3.3008837699890137, "rewards/reward_combined/mean": 3.375, "rewards/reward_combined/std": 3.3008837699890137, "step": 174 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 3.240740740740741, "frac_reward_zero_std": 1.0, "grad_norm": 0.020136449486017227, "kl": 0.0008117705583572388, "learning_rate": 8.7e-07, "loss": 0.0, "num_tokens": 58044.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 175 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.0, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 21.0, "completions/mean_terminated_length": 21.0, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 3.259259259259259, "frac_reward_zero_std": 1.0, "grad_norm": 0.0028178910724818707, "kl": 0.000139241532451706, "learning_rate": 8.75e-07, "loss": 0.0, "num_tokens": 58372.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 176 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 3.2777777777777777, "frac_reward_zero_std": 0.0, "grad_norm": 2.711977481842041, "kl": 0.0005000726814614609, "learning_rate": 8.8e-07, "loss": 0.0955, "num_tokens": 58648.0, "reward": 5.75, "reward_std": 2.598076105117798, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 2.598076105117798, "step": 177 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 27.0, "completions/clipped_ratio": 0.0, "completions/max_length": 38.0, "completions/max_terminated_length": 38.0, "completions/mean_length": 27.0, "completions/mean_terminated_length": 27.0, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 3.2962962962962963, "frac_reward_zero_std": 0.0, "grad_norm": 4.4448676109313965, "kl": 0.0010784152545966208, "learning_rate": 8.85e-07, "loss": -0.0356, "num_tokens": 58976.0, "reward": 1.875, "reward_std": 1.4361406564712524, "rewards/reward_combined/mean": 1.875, "rewards/reward_combined/std": 1.4361406564712524, "step": 178 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 31.5, "completions/clipped_ratio": 0.0, "completions/max_length": 37.0, "completions/max_terminated_length": 37.0, "completions/mean_length": 31.5, "completions/mean_terminated_length": 31.5, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 3.314814814814815, "frac_reward_zero_std": 0.0, "grad_norm": 3.2295610904693604, "kl": 0.0003543650091160089, "learning_rate": 8.900000000000001e-07, "loss": -0.0352, "num_tokens": 59326.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 179 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.0, "completions/clipped_ratio": 0.0, "completions/max_length": 34.0, "completions/max_terminated_length": 34.0, "completions/mean_length": 25.0, "completions/mean_terminated_length": 25.0, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 3.3333333333333335, "frac_reward_zero_std": 0.0, "grad_norm": 5.431831359863281, "kl": 0.003365610959008336, "learning_rate": 8.95e-07, "loss": -0.0675, "num_tokens": 59654.0, "reward": 0.0, "reward_std": 2.6140644550323486, "rewards/reward_combined/mean": 0.0, "rewards/reward_combined/std": 2.6140644550323486, "step": 180 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 75.25, "completions/clipped_ratio": 0.25, "completions/max_length": 256.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 75.25, "completions/mean_terminated_length": 15.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 3.351851851851852, "frac_reward_zero_std": 0.0, "grad_norm": 3.593400478363037, "kl": 0.00022540460486197844, "learning_rate": 9.000000000000001e-07, "loss": 0.4783, "num_tokens": 60179.0, "reward": 3.125, "reward_std": 5.05593729019165, "rewards/reward_combined/mean": 3.125, "rewards/reward_combined/std": 5.05593729019165, "step": 181 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 3.3703703703703702, "frac_reward_zero_std": 1.0, "grad_norm": 0.003122697351500392, "kl": 2.2100284695625305e-05, "learning_rate": 9.05e-07, "loss": 0.0, "num_tokens": 60423.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 182 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 27.75, "completions/clipped_ratio": 0.0, "completions/max_length": 49.0, "completions/max_terminated_length": 49.0, "completions/mean_length": 27.75, "completions/mean_terminated_length": 27.75, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 3.388888888888889, "frac_reward_zero_std": 0.0, "grad_norm": 2.675969362258911, "kl": 0.0005094165680930018, "learning_rate": 9.100000000000001e-07, "loss": 0.0508, "num_tokens": 60786.0, "reward": 4.75, "reward_std": 3.278719186782837, "rewards/reward_combined/mean": 4.75, "rewards/reward_combined/std": 3.278719186782837, "step": 183 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 3.4074074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.002446018159389496, "kl": 3.299116906418931e-05, "learning_rate": 9.15e-07, "loss": 0.0, "num_tokens": 61078.0, "reward": 0.5, "reward_std": 0.0, "rewards/reward_combined/mean": 0.5, "rewards/reward_combined/std": 0.0, "step": 184 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.25, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.25, "completions/mean_terminated_length": 14.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 3.425925925925926, "frac_reward_zero_std": 0.0, "grad_norm": 5.647721290588379, "kl": 0.00037025466463092016, "learning_rate": 9.200000000000001e-07, "loss": -0.0949, "num_tokens": 61395.0, "reward": 5.25, "reward_std": 4.5, "rewards/reward_combined/mean": 5.25, "rewards/reward_combined/std": 4.5, "step": 185 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.75, "completions/clipped_ratio": 0.0, "completions/max_length": 34.0, "completions/max_terminated_length": 34.0, "completions/mean_length": 22.75, "completions/mean_terminated_length": 22.75, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 3.4444444444444446, "frac_reward_zero_std": 0.0, "grad_norm": 4.454507827758789, "kl": 0.0007791472307872027, "learning_rate": 9.25e-07, "loss": 0.1284, "num_tokens": 61706.0, "reward": 5.375, "reward_std": 4.25, "rewards/reward_combined/mean": 5.375, "rewards/reward_combined/std": 4.25, "step": 186 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 45.25, "completions/clipped_ratio": 0.0, "completions/max_length": 53.0, "completions/max_terminated_length": 53.0, "completions/mean_length": 45.25, "completions/mean_terminated_length": 45.25, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "epoch": 3.462962962962963, "frac_reward_zero_std": 0.0, "grad_norm": 2.8904082775115967, "kl": 0.0006630937859881669, "learning_rate": 9.300000000000001e-07, "loss": 0.0012, "num_tokens": 62103.0, "reward": 2.25, "reward_std": 1.443375587463379, "rewards/reward_combined/mean": 2.25, "rewards/reward_combined/std": 1.4433757066726685, "step": 187 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 3.4814814814814814, "frac_reward_zero_std": 0.0, "grad_norm": 3.454972505569458, "kl": 0.00026252575480611995, "learning_rate": 9.35e-07, "loss": 0.0147, "num_tokens": 62373.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 188 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 3.5, "frac_reward_zero_std": 1.0, "grad_norm": 0.012308129109442234, "kl": 0.00014328781981021166, "learning_rate": 9.400000000000001e-07, "loss": 0.0, "num_tokens": 62643.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 189 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 99.75, "completions/clipped_ratio": 0.25, "completions/max_length": 256.0, "completions/max_terminated_length": 58.0, "completions/mean_length": 99.75, "completions/mean_terminated_length": 47.66666793823242, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 3.5185185185185186, "frac_reward_zero_std": 0.0, "grad_norm": 2.006739854812622, "kl": 0.00038072570168878883, "learning_rate": 9.450000000000001e-07, "loss": 0.0963, "num_tokens": 63262.0, "reward": -0.699999988079071, "reward_std": 1.4236104488372803, "rewards/reward_combined/mean": -0.699999988079071, "rewards/reward_combined/std": 1.4236104488372803, "step": 190 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.25, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 18.25, "completions/mean_terminated_length": 18.25, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 3.537037037037037, "frac_reward_zero_std": 0.0, "grad_norm": 2.305161237716675, "kl": 0.00011003762483596802, "learning_rate": 9.500000000000001e-07, "loss": 0.0139, "num_tokens": 63575.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 191 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.5, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 18.5, "completions/mean_terminated_length": 18.5, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 3.5555555555555554, "frac_reward_zero_std": 1.0, "grad_norm": 0.008766883984208107, "kl": 0.00026821778737939894, "learning_rate": 9.550000000000002e-07, "loss": 0.0, "num_tokens": 63877.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 192 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.5, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 3.5, "completions/mean_terminated_length": 3.5, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 3.574074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.036961473524570465, "kl": 0.0004311874508857727, "learning_rate": 9.600000000000001e-07, "loss": 0.0, "num_tokens": 64087.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 193 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 3.5925925925925926, "frac_reward_zero_std": 0.0, "grad_norm": 4.012454986572266, "kl": 5.3485324315261096e-05, "learning_rate": 9.65e-07, "loss": 0.033, "num_tokens": 64377.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 194 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.25, "completions/clipped_ratio": 0.0, "completions/max_length": 40.0, "completions/max_terminated_length": 40.0, "completions/mean_length": 32.25, "completions/mean_terminated_length": 32.25, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 3.611111111111111, "frac_reward_zero_std": 0.0, "grad_norm": 3.7620856761932373, "kl": 0.0007726932817604393, "learning_rate": 9.7e-07, "loss": -0.0169, "num_tokens": 64734.0, "reward": 1.875, "reward_std": 1.314977765083313, "rewards/reward_combined/mean": 1.875, "rewards/reward_combined/std": 1.3149778842926025, "step": 195 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 33.25, "completions/clipped_ratio": 0.0, "completions/max_length": 39.0, "completions/max_terminated_length": 39.0, "completions/mean_length": 33.25, "completions/mean_terminated_length": 33.25, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 3.6296296296296298, "frac_reward_zero_std": 0.0, "grad_norm": 3.593531370162964, "kl": 0.001955812331289053, "learning_rate": 9.750000000000002e-07, "loss": 0.045, "num_tokens": 65147.0, "reward": -0.9500000476837158, "reward_std": 2.245736837387085, "rewards/reward_combined/mean": -0.9500000476837158, "rewards/reward_combined/std": 2.245736837387085, "step": 196 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 3.648148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.015462894923985004, "kl": 0.0002474366265232675, "learning_rate": 9.800000000000001e-07, "loss": 0.0, "num_tokens": 65411.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 197 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.25, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 4.25, "completions/mean_terminated_length": 4.25, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 3.6666666666666665, "frac_reward_zero_std": 1.0, "grad_norm": 0.020246418192982674, "kl": 0.0002346962719457224, "learning_rate": 9.85e-07, "loss": 0.0, "num_tokens": 65624.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 198 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.75, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 17.75, "completions/mean_terminated_length": 17.75, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 3.685185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.011408284306526184, "kl": 0.00020903507902403362, "learning_rate": 9.9e-07, "loss": 0.0, "num_tokens": 65935.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_combined/mean": 1.0, "rewards/reward_combined/std": 0.0, "step": 199 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.5, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 2.5, "completions/mean_terminated_length": 2.5, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 3.7037037037037037, "frac_reward_zero_std": 0.0, "grad_norm": 13.419598579406738, "kl": 0.014389432966709137, "learning_rate": 9.950000000000002e-07, "loss": 0.1545, "num_tokens": 66149.0, "reward": 2.0, "reward_std": 3.0, "rewards/reward_combined/mean": 2.0, "rewards/reward_combined/std": 3.0, "step": 200 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.0, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 24.0, "completions/mean_terminated_length": 24.0, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 3.7222222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 8.99533333722502e-05, "kl": 4.1158248222927796e-06, "learning_rate": 1.0000000000000002e-06, "loss": 0.0, "num_tokens": 66513.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_combined/mean": 1.0, "rewards/reward_combined/std": 0.0, "step": 201 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 42.5, "completions/clipped_ratio": 0.0, "completions/max_length": 83.0, "completions/max_terminated_length": 83.0, "completions/mean_length": 42.5, "completions/mean_terminated_length": 42.5, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 3.7407407407407405, "frac_reward_zero_std": 0.0, "grad_norm": 2.0003652572631836, "kl": 0.0003467664573690854, "learning_rate": 1.0050000000000001e-06, "loss": -0.1055, "num_tokens": 66911.0, "reward": 3.5, "reward_std": 4.2031731605529785, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 4.203173637390137, "step": 202 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.75, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.75, "completions/mean_terminated_length": 6.75, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 3.7592592592592595, "frac_reward_zero_std": 0.0, "grad_norm": 10.411017417907715, "kl": 0.00024033657246036455, "learning_rate": 1.01e-06, "loss": 0.0455, "num_tokens": 67146.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 203 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.75, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 15.75, "completions/mean_terminated_length": 15.75, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 3.7777777777777777, "frac_reward_zero_std": 0.0, "grad_norm": 3.719759702682495, "kl": 0.0003463800239842385, "learning_rate": 1.0150000000000002e-06, "loss": 0.0918, "num_tokens": 67429.0, "reward": 4.125, "reward_std": 3.902456521987915, "rewards/reward_combined/mean": 4.125, "rewards/reward_combined/std": 3.902456521987915, "step": 204 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.75, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 4.75, "completions/mean_terminated_length": 4.75, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 3.7962962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.04005039855837822, "kl": 0.0010209515221504262, "learning_rate": 1.02e-06, "loss": 0.0, "num_tokens": 67648.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 205 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.5, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 19.5, "completions/mean_terminated_length": 19.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 3.814814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.016322951763868332, "kl": 0.0006170868000481278, "learning_rate": 1.025e-06, "loss": 0.0, "num_tokens": 67974.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 206 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 40.25, "completions/clipped_ratio": 0.0, "completions/max_length": 111.0, "completions/max_terminated_length": 111.0, "completions/mean_length": 40.25, "completions/mean_terminated_length": 40.25, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 3.8333333333333335, "frac_reward_zero_std": 0.0, "grad_norm": 2.8814539909362793, "kl": 0.00048515624075662345, "learning_rate": 1.03e-06, "loss": 0.3764, "num_tokens": 68383.0, "reward": 3.125, "reward_std": 0.75, "rewards/reward_combined/mean": 3.125, "rewards/reward_combined/std": 0.75, "step": 207 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.5, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 9.5, "completions/mean_terminated_length": 9.5, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 3.851851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.015571532770991325, "kl": 0.00042310013668611646, "learning_rate": 1.035e-06, "loss": 0.0, "num_tokens": 68645.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 208 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.75, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 13.75, "completions/mean_terminated_length": 13.75, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 3.8703703703703702, "frac_reward_zero_std": 0.0, "grad_norm": 3.808192253112793, "kl": 0.006432068039430305, "learning_rate": 1.04e-06, "loss": 0.0308, "num_tokens": 68924.0, "reward": 5.5, "reward_std": 5.0, "rewards/reward_combined/mean": 5.5, "rewards/reward_combined/std": 5.0, "step": 209 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 86.5, "completions/clipped_ratio": 0.25, "completions/max_length": 256.0, "completions/max_terminated_length": 49.0, "completions/mean_length": 86.5, "completions/mean_terminated_length": 30.0, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 3.888888888888889, "frac_reward_zero_std": 0.0, "grad_norm": 2.2043819427490234, "kl": 0.0004358821752248332, "learning_rate": 1.045e-06, "loss": 0.4026, "num_tokens": 69494.0, "reward": 5.050000190734863, "reward_std": 5.238638401031494, "rewards/reward_combined/mean": 5.050000190734863, "rewards/reward_combined/std": 5.238638401031494, "step": 210 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.75, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 15.75, "completions/mean_terminated_length": 15.75, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 3.9074074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.024295704439282417, "kl": 0.00039498626392742153, "learning_rate": 1.0500000000000001e-06, "loss": 0.0, "num_tokens": 69797.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 211 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 9.5, "completions/mean_terminated_length": 9.5, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 3.925925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.00828875508159399, "kl": 0.0001855584378063213, "learning_rate": 1.055e-06, "loss": 0.0, "num_tokens": 70055.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 212 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.019999999552965164, "clip_ratio/low_min": 0.019999999552965164, "clip_ratio/region_mean": 0.019999999552965164, "completion_length": 11.75, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 11.75, "completions/mean_terminated_length": 11.75, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 3.9444444444444446, "frac_reward_zero_std": 0.0, "grad_norm": 4.292230606079102, "kl": 0.0005158344574738294, "learning_rate": 1.06e-06, "loss": 0.0573, "num_tokens": 70330.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 213 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 10.5, "completions/mean_terminated_length": 10.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 3.962962962962963, "frac_reward_zero_std": 0.0, "grad_norm": 5.083495140075684, "kl": 0.009145412223006133, "learning_rate": 1.065e-06, "loss": 0.0435, "num_tokens": 70588.0, "reward": 2.5, "reward_std": 2.0, "rewards/reward_combined/mean": 2.5, "rewards/reward_combined/std": 2.0, "step": 214 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.25, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 14.25, "completions/mean_terminated_length": 14.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 3.9814814814814814, "frac_reward_zero_std": 0.0, "grad_norm": 5.505530834197998, "kl": 0.00036180300230626017, "learning_rate": 1.0700000000000001e-06, "loss": 0.1297, "num_tokens": 70897.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 215 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 96.75, "completions/clipped_ratio": 0.25, "completions/max_length": 256.0, "completions/max_terminated_length": 47.0, "completions/mean_length": 96.75, "completions/mean_terminated_length": 43.66666793823242, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "epoch": 4.0, "frac_reward_zero_std": 0.0, "grad_norm": 1.4811878204345703, "kl": 0.0006696357449982315, "learning_rate": 1.075e-06, "loss": 0.4353, "num_tokens": 71520.0, "reward": 2.174999952316284, "reward_std": 4.925021171569824, "rewards/reward_combined/mean": 2.174999952316284, "rewards/reward_combined/std": 4.925021171569824, "step": 216 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.75, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 13.75, "completions/mean_terminated_length": 13.75, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 4.018518518518518, "frac_reward_zero_std": 0.0, "grad_norm": 6.149474620819092, "kl": 0.0006648863054579124, "learning_rate": 1.08e-06, "loss": -0.0079, "num_tokens": 71827.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 217 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 50.5, "completions/clipped_ratio": 0.0, "completions/max_length": 132.0, "completions/max_terminated_length": 132.0, "completions/mean_length": 50.5, "completions/mean_terminated_length": 50.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 4.037037037037037, "frac_reward_zero_std": 0.0, "grad_norm": 2.6049511432647705, "kl": 0.0005178407882340252, "learning_rate": 1.085e-06, "loss": -0.2799, "num_tokens": 72249.0, "reward": 3.125, "reward_std": 1.4361406564712524, "rewards/reward_combined/mean": 3.125, "rewards/reward_combined/std": 1.4361406564712524, "step": 218 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 4.055555555555555, "frac_reward_zero_std": 1.0, "grad_norm": 0.00639579351991415, "kl": 0.0002965182065963745, "learning_rate": 1.0900000000000002e-06, "loss": 0.0, "num_tokens": 72461.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 219 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 82.0, "completions/clipped_ratio": 0.25, "completions/max_length": 256.0, "completions/max_terminated_length": 29.0, "completions/mean_length": 82.0, "completions/mean_terminated_length": 24.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 4.074074074074074, "frac_reward_zero_std": 0.0, "grad_norm": 4.109419822692871, "kl": 0.007602300436701626, "learning_rate": 1.095e-06, "loss": 0.3245, "num_tokens": 73017.0, "reward": 0.5499999523162842, "reward_std": 5.116313934326172, "rewards/reward_combined/mean": 0.5499999523162842, "rewards/reward_combined/std": 5.116313934326172, "step": 220 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.25, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 6.25, "completions/mean_terminated_length": 6.25, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 4.092592592592593, "frac_reward_zero_std": 0.0, "grad_norm": 8.467757225036621, "kl": 0.001044327742420137, "learning_rate": 1.1e-06, "loss": -0.017, "num_tokens": 73254.0, "reward": 2.125, "reward_std": 1.8874585628509521, "rewards/reward_combined/mean": 2.125, "rewards/reward_combined/std": 1.8874585628509521, "step": 221 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 4.111111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.016274528577923775, "kl": 0.0007994174957275391, "learning_rate": 1.105e-06, "loss": 0.0, "num_tokens": 73490.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 222 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.5, "completions/clipped_ratio": 0.0, "completions/max_length": 30.0, "completions/max_terminated_length": 30.0, "completions/mean_length": 19.5, "completions/mean_terminated_length": 19.5, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 4.12962962962963, "frac_reward_zero_std": 0.0, "grad_norm": 7.802985191345215, "kl": 0.0009520261955913156, "learning_rate": 1.1100000000000002e-06, "loss": 0.1421, "num_tokens": 73828.0, "reward": 4.125, "reward_std": 4.308422088623047, "rewards/reward_combined/mean": 4.125, "rewards/reward_combined/std": 4.308422088623047, "step": 223 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 4.148148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.007984772324562073, "kl": 0.0005319621413946152, "learning_rate": 1.1150000000000001e-06, "loss": 0.0, "num_tokens": 74088.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 224 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.009999999776482582, "clip_ratio/low_min": 0.009999999776482582, "clip_ratio/region_mean": 0.009999999776482582, "completion_length": 23.0, "completions/clipped_ratio": 0.0, "completions/max_length": 32.0, "completions/max_terminated_length": 32.0, "completions/mean_length": 23.0, "completions/mean_terminated_length": 23.0, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 4.166666666666667, "frac_reward_zero_std": 0.0, "grad_norm": 6.037923336029053, "kl": 0.0023668191861361265, "learning_rate": 1.12e-06, "loss": 0.0291, "num_tokens": 74408.0, "reward": 1.875, "reward_std": 1.314977765083313, "rewards/reward_combined/mean": 1.875, "rewards/reward_combined/std": 1.3149778842926025, "step": 225 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.75, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 18.75, "completions/mean_terminated_length": 18.75, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 4.185185185185185, "frac_reward_zero_std": 0.0, "grad_norm": 2.785388708114624, "kl": 0.0006359066173899919, "learning_rate": 1.125e-06, "loss": 0.0166, "num_tokens": 74723.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 226 }, { "clip_ratio/high_max": 0.006849315017461777, "clip_ratio/high_mean": 0.006849315017461777, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.006849315017461777, "completion_length": 24.0, "completions/clipped_ratio": 0.0, "completions/max_length": 50.0, "completions/max_terminated_length": 50.0, "completions/mean_length": 24.0, "completions/mean_terminated_length": 24.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 4.203703703703703, "frac_reward_zero_std": 0.0, "grad_norm": 3.976260185241699, "kl": 0.0013426027144305408, "learning_rate": 1.1300000000000002e-06, "loss": -0.1895, "num_tokens": 75039.0, "reward": 7.0, "reward_std": 1.0, "rewards/reward_combined/mean": 7.0, "rewards/reward_combined/std": 1.0, "step": 227 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 77.0, "completions/clipped_ratio": 0.25, "completions/max_length": 256.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 77.0, "completions/mean_terminated_length": 17.33333396911621, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 4.222222222222222, "frac_reward_zero_std": 0.0, "grad_norm": 1.7177538871765137, "kl": 0.0006136625452199951, "learning_rate": 1.1350000000000001e-06, "loss": 0.453, "num_tokens": 75567.0, "reward": 4.675000190734863, "reward_std": 5.650000095367432, "rewards/reward_combined/mean": 4.675000190734863, "rewards/reward_combined/std": 5.650000095367432, "step": 228 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 99.5, "completions/clipped_ratio": 0.0, "completions/max_length": 205.0, "completions/max_terminated_length": 205.0, "completions/mean_length": 99.5, "completions/mean_terminated_length": 99.5, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "epoch": 4.2407407407407405, "frac_reward_zero_std": 0.0, "grad_norm": 2.6025264263153076, "kl": 0.0014317751047201455, "learning_rate": 1.14e-06, "loss": -0.0071, "num_tokens": 76201.0, "reward": -0.375, "reward_std": 1.314977765083313, "rewards/reward_combined/mean": -0.375, "rewards/reward_combined/std": 1.3149778842926025, "step": 229 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.0, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 21.0, "completions/mean_terminated_length": 21.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 4.2592592592592595, "frac_reward_zero_std": 0.0, "grad_norm": 2.1561126708984375, "kl": 0.0006415839670808055, "learning_rate": 1.145e-06, "loss": -0.0392, "num_tokens": 76533.0, "reward": 2.25, "reward_std": 3.5, "rewards/reward_combined/mean": 2.25, "rewards/reward_combined/std": 3.5, "step": 230 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 105.0, "completions/clipped_ratio": 0.25, "completions/max_length": 256.0, "completions/max_terminated_length": 88.0, "completions/mean_length": 105.0, "completions/mean_terminated_length": 54.66666793823242, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 4.277777777777778, "frac_reward_zero_std": 0.0, "grad_norm": 2.7575879096984863, "kl": 0.0007159417145885527, "learning_rate": 1.1500000000000002e-06, "loss": 0.2664, "num_tokens": 77173.0, "reward": -0.07500004768371582, "reward_std": 3.053277015686035, "rewards/reward_combined/mean": -0.07500004768371582, "rewards/reward_combined/std": 3.0532772541046143, "step": 231 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 33.0, "completions/max_terminated_length": 33.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 4.296296296296296, "frac_reward_zero_std": 0.0, "grad_norm": 5.8607497215271, "kl": 0.0006866858748253435, "learning_rate": 1.1550000000000002e-06, "loss": 0.3258, "num_tokens": 77461.0, "reward": 4.0, "reward_std": 4.041451930999756, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 4.041451930999756, "step": 232 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.75, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 16.75, "completions/mean_terminated_length": 16.75, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 4.314814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.004494777414947748, "kl": 8.15409621282015e-05, "learning_rate": 1.1600000000000001e-06, "loss": 0.0, "num_tokens": 77768.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_combined/mean": 1.0, "rewards/reward_combined/std": 0.0, "step": 233 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.75, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.75, "completions/mean_terminated_length": 14.75, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 4.333333333333333, "frac_reward_zero_std": 0.0, "grad_norm": 2.0760514736175537, "kl": 0.000535394181497395, "learning_rate": 1.165e-06, "loss": 0.0141, "num_tokens": 78051.0, "reward": 4.5, "reward_std": 4.041451930999756, "rewards/reward_combined/mean": 4.5, "rewards/reward_combined/std": 4.041451930999756, "step": 234 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 4.351851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.032981693744659424, "kl": 0.000625381464487873, "learning_rate": 1.1700000000000002e-06, "loss": 0.0, "num_tokens": 78317.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 235 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.75, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 24.75, "completions/mean_terminated_length": 24.75, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 4.37037037037037, "frac_reward_zero_std": 0.0, "grad_norm": 1.2913496494293213, "kl": 0.00020902293908875436, "learning_rate": 1.175e-06, "loss": 0.0099, "num_tokens": 78684.0, "reward": 1.625, "reward_std": 1.25, "rewards/reward_combined/mean": 1.625, "rewards/reward_combined/std": 1.25, "step": 236 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.25, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 11.25, "completions/mean_terminated_length": 11.25, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 4.388888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.02641241066157818, "kl": 0.000496070584631525, "learning_rate": 1.1800000000000001e-06, "loss": 0.0, "num_tokens": 78949.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 237 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 4.407407407407407, "frac_reward_zero_std": 1.0, "grad_norm": 9.839051926974207e-05, "kl": 1.7657876014709473e-06, "learning_rate": 1.185e-06, "loss": 0.0, "num_tokens": 79169.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 238 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.25, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 11.25, "completions/mean_terminated_length": 11.25, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 4.425925925925926, "frac_reward_zero_std": 0.0, "grad_norm": 9.08411979675293, "kl": 0.0004420115437824279, "learning_rate": 1.19e-06, "loss": 0.228, "num_tokens": 79434.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 239 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 4.444444444444445, "frac_reward_zero_std": 1.0, "grad_norm": 0.011137240566313267, "kl": 0.0001389548233419191, "learning_rate": 1.195e-06, "loss": 0.0, "num_tokens": 79690.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 240 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 35.0, "completions/clipped_ratio": 0.0, "completions/max_length": 49.0, "completions/max_terminated_length": 49.0, "completions/mean_length": 35.0, "completions/mean_terminated_length": 35.0, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 4.462962962962963, "frac_reward_zero_std": 0.0, "grad_norm": 3.5040652751922607, "kl": 0.0005904131976421922, "learning_rate": 1.2000000000000002e-06, "loss": 0.2253, "num_tokens": 80054.0, "reward": 1.875, "reward_std": 4.308422088623047, "rewards/reward_combined/mean": 1.875, "rewards/reward_combined/std": 4.308422088623047, "step": 241 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.5, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 12.5, "completions/mean_terminated_length": 12.5, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 4.481481481481482, "frac_reward_zero_std": 1.0, "grad_norm": 0.02287081442773342, "kl": 0.0003317360387882218, "learning_rate": 1.2050000000000001e-06, "loss": 0.0, "num_tokens": 80328.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 242 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 91.75, "completions/clipped_ratio": 0.25, "completions/max_length": 256.0, "completions/max_terminated_length": 49.0, "completions/mean_length": 91.75, "completions/mean_terminated_length": 37.0, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 4.5, "frac_reward_zero_std": 0.0, "grad_norm": 1.798316478729248, "kl": 0.0014671966200694442, "learning_rate": 1.21e-06, "loss": 0.2304, "num_tokens": 80947.0, "reward": 0.550000011920929, "reward_std": 1.9604421854019165, "rewards/reward_combined/mean": 0.550000011920929, "rewards/reward_combined/std": 1.960442066192627, "step": 243 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 93.75, "completions/clipped_ratio": 0.25, "completions/max_length": 256.0, "completions/max_terminated_length": 43.0, "completions/mean_length": 93.75, "completions/mean_terminated_length": 39.66666793823242, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 4.518518518518518, "frac_reward_zero_std": 0.0, "grad_norm": 1.0835953950881958, "kl": 0.000426981074269861, "learning_rate": 1.215e-06, "loss": -0.0659, "num_tokens": 81602.0, "reward": 1.7750000953674316, "reward_std": 0.8500000238418579, "rewards/reward_combined/mean": 1.7750000953674316, "rewards/reward_combined/std": 0.8500000238418579, "step": 244 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.25, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.25, "completions/mean_terminated_length": 6.25, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 4.537037037037037, "frac_reward_zero_std": 0.0, "grad_norm": 3.184598207473755, "kl": 0.00022651396284345537, "learning_rate": 1.2200000000000002e-06, "loss": -0.0022, "num_tokens": 81835.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 245 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.5, "completions/clipped_ratio": 0.0, "completions/max_length": 25.0, "completions/max_terminated_length": 25.0, "completions/mean_length": 21.5, "completions/mean_terminated_length": 21.5, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 4.555555555555555, "frac_reward_zero_std": 0.0, "grad_norm": 3.432215929031372, "kl": 0.028317431453615427, "learning_rate": 1.2250000000000001e-06, "loss": 0.057, "num_tokens": 82145.0, "reward": 2.5, "reward_std": 4.3011627197265625, "rewards/reward_combined/mean": 2.5, "rewards/reward_combined/std": 4.3011627197265625, "step": 246 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.25, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 18.25, "completions/mean_terminated_length": 18.25, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 4.574074074074074, "frac_reward_zero_std": 0.0, "grad_norm": 4.036012172698975, "kl": 0.001386194402584806, "learning_rate": 1.23e-06, "loss": -0.1551, "num_tokens": 82470.0, "reward": 6.75, "reward_std": 2.5, "rewards/reward_combined/mean": 6.75, "rewards/reward_combined/std": 2.5, "step": 247 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.25, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 16.25, "completions/mean_terminated_length": 16.25, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 4.592592592592593, "frac_reward_zero_std": 0.0, "grad_norm": 4.063658237457275, "kl": 0.0006246385164558887, "learning_rate": 1.235e-06, "loss": 0.0004, "num_tokens": 82775.0, "reward": 5.75, "reward_std": 2.598076105117798, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 2.598076105117798, "step": 248 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 4.611111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.08718718588352203, "kl": 0.0023730514221824706, "learning_rate": 1.2400000000000002e-06, "loss": 0.0001, "num_tokens": 83047.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 249 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.5, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 6.5, "completions/mean_terminated_length": 6.5, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 4.62962962962963, "frac_reward_zero_std": 0.0, "grad_norm": 9.658737182617188, "kl": 0.0033156605204567313, "learning_rate": 1.2450000000000002e-06, "loss": 0.2768, "num_tokens": 83273.0, "reward": 3.125, "reward_std": 1.75, "rewards/reward_combined/mean": 3.125, "rewards/reward_combined/std": 1.75, "step": 250 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.5, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 10.5, "completions/mean_terminated_length": 10.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 4.648148148148148, "frac_reward_zero_std": 0.0, "grad_norm": 8.6264066696167, "kl": 0.00041886974941007793, "learning_rate": 1.25e-06, "loss": -0.0197, "num_tokens": 83543.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 251 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 4.666666666666667, "frac_reward_zero_std": 1.0, "grad_norm": 0.024177437648177147, "kl": 0.0007547425921075046, "learning_rate": 1.255e-06, "loss": 0.0, "num_tokens": 83815.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 252 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 4.685185185185185, "frac_reward_zero_std": 0.0, "grad_norm": 3.272758960723877, "kl": 0.00039277435280382633, "learning_rate": 1.26e-06, "loss": 0.0173, "num_tokens": 84101.0, "reward": 5.125, "reward_std": 3.4731109142303467, "rewards/reward_combined/mean": 5.125, "rewards/reward_combined/std": 3.4731109142303467, "step": 253 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.5, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 2.5, "completions/mean_terminated_length": 2.5, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 4.703703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.013816445134580135, "kl": 0.0003852471709251404, "learning_rate": 1.2650000000000002e-06, "loss": 0.0, "num_tokens": 84307.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 254 }, { "clip_ratio/high_max": 0.016129031777381897, "clip_ratio/high_mean": 0.016129031777381897, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.016129031777381897, "completion_length": 13.75, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 13.75, "completions/mean_terminated_length": 13.75, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 4.722222222222222, "frac_reward_zero_std": 0.0, "grad_norm": 6.735982894897461, "kl": 0.001569026499055326, "learning_rate": 1.2700000000000001e-06, "loss": 0.0742, "num_tokens": 84578.0, "reward": 4.5, "reward_std": 4.041451930999756, "rewards/reward_combined/mean": 4.5, "rewards/reward_combined/std": 4.041451930999756, "step": 255 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.75, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 20.75, "completions/mean_terminated_length": 20.75, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 4.7407407407407405, "frac_reward_zero_std": 0.0, "grad_norm": 4.9526448249816895, "kl": 0.0008575281972298399, "learning_rate": 1.275e-06, "loss": -0.0123, "num_tokens": 84905.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 256 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.75, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 15.75, "completions/mean_terminated_length": 15.75, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 4.7592592592592595, "frac_reward_zero_std": 0.0, "grad_norm": 3.209765672683716, "kl": 0.004313843906857073, "learning_rate": 1.28e-06, "loss": -0.0427, "num_tokens": 85196.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 257 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 4.777777777777778, "frac_reward_zero_std": 1.0, "grad_norm": 0.008018538355827332, "kl": 0.0002917383753811009, "learning_rate": 1.2850000000000002e-06, "loss": 0.0, "num_tokens": 85480.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 258 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.0, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 17.0, "completions/mean_terminated_length": 17.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 4.796296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.03495369106531143, "kl": 0.002065973589196801, "learning_rate": 1.2900000000000001e-06, "loss": 0.0001, "num_tokens": 85796.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 259 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.75, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 4.75, "completions/mean_terminated_length": 4.75, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 4.814814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.0792560800909996, "kl": 0.004229089594446123, "learning_rate": 1.295e-06, "loss": 0.0002, "num_tokens": 86015.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 260 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.0, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 19.0, "completions/mean_terminated_length": 19.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 4.833333333333333, "frac_reward_zero_std": 0.0, "grad_norm": 7.595400810241699, "kl": 0.003271816996857524, "learning_rate": 1.3e-06, "loss": 0.0368, "num_tokens": 86311.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 261 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.25, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 12.25, "completions/mean_terminated_length": 12.25, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 4.851851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.10929770022630692, "kl": 0.001972758036572486, "learning_rate": 1.3050000000000002e-06, "loss": 0.0001, "num_tokens": 86584.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 262 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 4.87037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.07193797826766968, "kl": 0.0037351057690102607, "learning_rate": 1.3100000000000002e-06, "loss": 0.0002, "num_tokens": 86876.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 263 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.75, "completions/clipped_ratio": 0.0, "completions/max_length": 46.0, "completions/max_terminated_length": 46.0, "completions/mean_length": 32.75, "completions/mean_terminated_length": 32.75, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 4.888888888888889, "frac_reward_zero_std": 0.0, "grad_norm": 3.2679197788238525, "kl": 0.0013694562949240208, "learning_rate": 1.3150000000000001e-06, "loss": 0.2543, "num_tokens": 87223.0, "reward": 2.625, "reward_std": 3.25, "rewards/reward_combined/mean": 2.625, "rewards/reward_combined/std": 3.25, "step": 264 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 4.907407407407407, "frac_reward_zero_std": 1.0, "grad_norm": 0.055516865104436874, "kl": 0.0009077079594135284, "learning_rate": 1.32e-06, "loss": 0.0, "num_tokens": 87435.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 265 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.009803921915590763, "clip_ratio/low_min": 0.009803921915590763, "clip_ratio/region_mean": 0.009803921915590763, "completion_length": 23.25, "completions/clipped_ratio": 0.0, "completions/max_length": 31.0, "completions/max_terminated_length": 31.0, "completions/mean_length": 23.25, "completions/mean_terminated_length": 23.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 4.925925925925926, "frac_reward_zero_std": 0.0, "grad_norm": 4.692701816558838, "kl": 0.003231094917282462, "learning_rate": 1.3250000000000002e-06, "loss": 0.1735, "num_tokens": 87752.0, "reward": 4.125, "reward_std": 3.902456521987915, "rewards/reward_combined/mean": 4.125, "rewards/reward_combined/std": 3.902456521987915, "step": 266 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.0, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 21.0, "completions/mean_terminated_length": 21.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 4.944444444444445, "frac_reward_zero_std": 0.0, "grad_norm": 3.640122175216675, "kl": 0.0005562437872868031, "learning_rate": 1.3300000000000002e-06, "loss": 0.0362, "num_tokens": 88084.0, "reward": 1.75, "reward_std": 1.5, "rewards/reward_combined/mean": 1.75, "rewards/reward_combined/std": 1.5, "step": 267 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.75, "completions/clipped_ratio": 0.0, "completions/max_length": 38.0, "completions/max_terminated_length": 38.0, "completions/mean_length": 22.75, "completions/mean_terminated_length": 22.75, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 4.962962962962963, "frac_reward_zero_std": 0.0, "grad_norm": 4.136295795440674, "kl": 0.0020855419570580125, "learning_rate": 1.3350000000000001e-06, "loss": 0.0976, "num_tokens": 88415.0, "reward": 2.125, "reward_std": 4.007804870605469, "rewards/reward_combined/mean": 2.125, "rewards/reward_combined/std": 4.007804870605469, "step": 268 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 4.981481481481482, "frac_reward_zero_std": 0.0, "grad_norm": 3.1028292179107666, "kl": 0.0020239048171788454, "learning_rate": 1.34e-06, "loss": 0.0372, "num_tokens": 88705.0, "reward": 2.25, "reward_std": 3.5, "rewards/reward_combined/mean": 2.25, "rewards/reward_combined/std": 3.5, "step": 269 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 31.25, "completions/clipped_ratio": 0.0, "completions/max_length": 49.0, "completions/max_terminated_length": 49.0, "completions/mean_length": 31.25, "completions/mean_terminated_length": 31.25, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 5.0, "frac_reward_zero_std": 0.0, "grad_norm": 4.353553295135498, "kl": 0.0006322429107967764, "learning_rate": 1.3450000000000003e-06, "loss": 0.2222, "num_tokens": 89058.0, "reward": 5.0, "reward_std": 3.674234628677368, "rewards/reward_combined/mean": 5.0, "rewards/reward_combined/std": 3.674234628677368, "step": 270 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.25, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 8.25, "completions/mean_terminated_length": 8.25, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 5.018518518518518, "frac_reward_zero_std": 1.0, "grad_norm": 0.09564558416604996, "kl": 0.0013528501149266958, "learning_rate": 1.3500000000000002e-06, "loss": 0.0001, "num_tokens": 89303.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 271 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 93.75, "completions/clipped_ratio": 0.25, "completions/max_length": 256.0, "completions/max_terminated_length": 45.0, "completions/mean_length": 93.75, "completions/mean_terminated_length": 39.66666793823242, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 5.037037037037037, "frac_reward_zero_std": 0.0, "grad_norm": 2.4159562587738037, "kl": 0.0011875185591634363, "learning_rate": 1.3550000000000002e-06, "loss": 0.0548, "num_tokens": 89894.0, "reward": 2.125, "reward_std": 1.6007810831069946, "rewards/reward_combined/mean": 2.125, "rewards/reward_combined/std": 1.6007810831069946, "step": 272 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.25, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 11.25, "completions/mean_terminated_length": 11.25, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 5.055555555555555, "frac_reward_zero_std": 0.0, "grad_norm": 6.675926685333252, "kl": 0.0009131034748861566, "learning_rate": 1.3600000000000001e-06, "loss": -0.0027, "num_tokens": 90167.0, "reward": 4.5, "reward_std": 4.041451930999756, "rewards/reward_combined/mean": 4.5, "rewards/reward_combined/std": 4.041451930999756, "step": 273 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.25, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.25, "completions/mean_terminated_length": 13.25, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 5.074074074074074, "frac_reward_zero_std": 0.0, "grad_norm": 4.940035343170166, "kl": 0.0010095499746967107, "learning_rate": 1.3650000000000003e-06, "loss": 0.0003, "num_tokens": 90452.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 274 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 5.092592592592593, "frac_reward_zero_std": 0.0, "grad_norm": 3.5658156871795654, "kl": 0.0003050342929782346, "learning_rate": 1.3700000000000002e-06, "loss": 0.0098, "num_tokens": 90712.0, "reward": 4.625, "reward_std": 3.4731109142303467, "rewards/reward_combined/mean": 4.625, "rewards/reward_combined/std": 3.4731109142303467, "step": 275 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 5.111111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.002516094595193863, "kl": 0.00037193937168922275, "learning_rate": 1.3750000000000002e-06, "loss": 0.0, "num_tokens": 90992.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 276 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 5.12962962962963, "frac_reward_zero_std": 0.0, "grad_norm": 7.5522871017456055, "kl": 0.0007623080164194107, "learning_rate": 1.3800000000000001e-06, "loss": 0.0053, "num_tokens": 91252.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 277 }, { "clip_ratio/high_max": 0.012820512987673283, "clip_ratio/high_mean": 0.012820512987673283, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.012820512987673283, "completion_length": 20.75, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 20.75, "completions/mean_terminated_length": 20.75, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 5.148148148148148, "frac_reward_zero_std": 0.0, "grad_norm": 4.1035847663879395, "kl": 0.0019235096988268197, "learning_rate": 1.3850000000000003e-06, "loss": -0.0803, "num_tokens": 91583.0, "reward": 2.5, "reward_std": 1.7320507764816284, "rewards/reward_combined/mean": 2.5, "rewards/reward_combined/std": 1.7320507764816284, "step": 278 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.75, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 16.75, "completions/mean_terminated_length": 16.75, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 5.166666666666667, "frac_reward_zero_std": 0.0, "grad_norm": 7.858551502227783, "kl": 0.0016786588821560144, "learning_rate": 1.3900000000000002e-06, "loss": -0.0422, "num_tokens": 91870.0, "reward": 7.0, "reward_std": 1.0, "rewards/reward_combined/mean": 7.0, "rewards/reward_combined/std": 1.0, "step": 279 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 43.0, "completions/clipped_ratio": 0.0, "completions/max_length": 57.0, "completions/max_terminated_length": 57.0, "completions/mean_length": 43.0, "completions/mean_terminated_length": 43.0, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 5.185185185185185, "frac_reward_zero_std": 0.0, "grad_norm": 2.360734462738037, "kl": 0.0014736028970219195, "learning_rate": 1.3950000000000002e-06, "loss": -0.0117, "num_tokens": 92262.0, "reward": 2.875, "reward_std": 3.3008837699890137, "rewards/reward_combined/mean": 2.875, "rewards/reward_combined/std": 3.3008837699890137, "step": 280 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 33.75, "completions/clipped_ratio": 0.0, "completions/max_length": 87.0, "completions/max_terminated_length": 87.0, "completions/mean_length": 33.75, "completions/mean_terminated_length": 33.75, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 5.203703703703703, "frac_reward_zero_std": 0.0, "grad_norm": 3.280803680419922, "kl": 0.0020239449804648757, "learning_rate": 1.4000000000000001e-06, "loss": 0.3463, "num_tokens": 92645.0, "reward": 2.25, "reward_std": 2.5, "rewards/reward_combined/mean": 2.25, "rewards/reward_combined/std": 2.5, "step": 281 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.25, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 13.25, "completions/mean_terminated_length": 13.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 5.222222222222222, "frac_reward_zero_std": 0.0, "grad_norm": 4.2747802734375, "kl": 0.0016221690457314253, "learning_rate": 1.4050000000000003e-06, "loss": 0.1456, "num_tokens": 92922.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 282 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 36.0, "completions/clipped_ratio": 0.0, "completions/max_length": 88.0, "completions/max_terminated_length": 88.0, "completions/mean_length": 36.0, "completions/mean_terminated_length": 36.0, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 5.2407407407407405, "frac_reward_zero_std": 0.0, "grad_norm": 3.6026294231414795, "kl": 0.0028052098932676017, "learning_rate": 1.41e-06, "loss": 0.3746, "num_tokens": 93294.0, "reward": 5.625, "reward_std": 2.462214469909668, "rewards/reward_combined/mean": 5.625, "rewards/reward_combined/std": 2.462214469909668, "step": 283 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 29.25, "completions/clipped_ratio": 0.0, "completions/max_length": 77.0, "completions/max_terminated_length": 77.0, "completions/mean_length": 29.25, "completions/mean_terminated_length": 29.25, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 5.2592592592592595, "frac_reward_zero_std": 0.0, "grad_norm": 7.727613925933838, "kl": 0.0013740895956289023, "learning_rate": 1.415e-06, "loss": -0.0013, "num_tokens": 93627.0, "reward": 2.875, "reward_std": 3.4247870445251465, "rewards/reward_combined/mean": 2.875, "rewards/reward_combined/std": 3.4247870445251465, "step": 284 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 39.75, "completions/clipped_ratio": 0.0, "completions/max_length": 49.0, "completions/max_terminated_length": 49.0, "completions/mean_length": 39.75, "completions/mean_terminated_length": 39.75, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 5.277777777777778, "frac_reward_zero_std": 0.0, "grad_norm": 3.7345058917999268, "kl": 0.0008171926601789892, "learning_rate": 1.42e-06, "loss": -0.0288, "num_tokens": 94014.0, "reward": 2.125, "reward_std": 1.6007810831069946, "rewards/reward_combined/mean": 2.125, "rewards/reward_combined/std": 1.6007810831069946, "step": 285 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 36.5, "completions/clipped_ratio": 0.0, "completions/max_length": 63.0, "completions/max_terminated_length": 63.0, "completions/mean_length": 36.5, "completions/mean_terminated_length": 36.5, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 5.296296296296296, "frac_reward_zero_std": 0.0, "grad_norm": 4.9202752113342285, "kl": 0.013990835752338171, "learning_rate": 1.425e-06, "loss": -0.3621, "num_tokens": 94384.0, "reward": 1.5, "reward_std": 4.242640495300293, "rewards/reward_combined/mean": 1.5, "rewards/reward_combined/std": 4.242640495300293, "step": 286 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 10.5, "completions/mean_terminated_length": 10.5, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 5.314814814814815, "frac_reward_zero_std": 0.0, "grad_norm": 5.549915313720703, "kl": 0.0015825071022845805, "learning_rate": 1.43e-06, "loss": 0.0985, "num_tokens": 94646.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 287 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.75, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 4.75, "completions/mean_terminated_length": 4.75, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 5.333333333333333, "frac_reward_zero_std": 1.0, "grad_norm": 0.04797985404729843, "kl": 0.0013022526836721227, "learning_rate": 1.435e-06, "loss": 0.0001, "num_tokens": 94865.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 288 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 5.351851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.003663354553282261, "kl": 0.00016910582780838013, "learning_rate": 1.44e-06, "loss": 0.0, "num_tokens": 95153.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 289 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.5, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 8.5, "completions/mean_terminated_length": 8.5, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 5.37037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.030552180483937263, "kl": 0.0010455847004777752, "learning_rate": 1.445e-06, "loss": 0.0, "num_tokens": 95407.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 290 }, { "clip_ratio/high_max": 0.012500000186264515, "clip_ratio/high_mean": 0.012500000186264515, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.012500000186264515, "completion_length": 19.75, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 19.75, "completions/mean_terminated_length": 19.75, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 5.388888888888889, "frac_reward_zero_std": 0.0, "grad_norm": 6.954070568084717, "kl": 0.0010619904496707022, "learning_rate": 1.45e-06, "loss": 0.0655, "num_tokens": 95706.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 291 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 5.407407407407407, "frac_reward_zero_std": 1.0, "grad_norm": 0.037346817553043365, "kl": 0.0038795453729107976, "learning_rate": 1.455e-06, "loss": 0.0002, "num_tokens": 95994.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 292 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 40.75, "completions/clipped_ratio": 0.0, "completions/max_length": 91.0, "completions/max_terminated_length": 91.0, "completions/mean_length": 40.75, "completions/mean_terminated_length": 40.75, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 5.425925925925926, "frac_reward_zero_std": 0.0, "grad_norm": 3.6005020141601562, "kl": 0.0019933604053221643, "learning_rate": 1.46e-06, "loss": 0.0469, "num_tokens": 96381.0, "reward": 3.924999952316284, "reward_std": 3.9903006553649902, "rewards/reward_combined/mean": 3.924999952316284, "rewards/reward_combined/std": 3.9903006553649902, "step": 293 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.5, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 18.5, "completions/mean_terminated_length": 18.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 5.444444444444445, "frac_reward_zero_std": 0.0, "grad_norm": 4.772615432739258, "kl": 0.028794721933081746, "learning_rate": 1.465e-06, "loss": -0.065, "num_tokens": 96703.0, "reward": 5.0, "reward_std": 3.316624879837036, "rewards/reward_combined/mean": 5.0, "rewards/reward_combined/std": 3.316624879837036, "step": 294 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.5, "completions/clipped_ratio": 0.0, "completions/max_length": 46.0, "completions/max_terminated_length": 46.0, "completions/mean_length": 22.5, "completions/mean_terminated_length": 22.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 5.462962962962963, "frac_reward_zero_std": 0.0, "grad_norm": 5.894853115081787, "kl": 0.0005966611597614246, "learning_rate": 1.4700000000000001e-06, "loss": 0.2518, "num_tokens": 97053.0, "reward": 5.25, "reward_std": 4.5, "rewards/reward_combined/mean": 5.25, "rewards/reward_combined/std": 4.5, "step": 295 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 5.481481481481482, "frac_reward_zero_std": 1.0, "grad_norm": 0.00043081981129944324, "kl": 1.996755599975586e-06, "learning_rate": 1.475e-06, "loss": 0.0, "num_tokens": 97333.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 296 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 5.5, "frac_reward_zero_std": 1.0, "grad_norm": 0.048335760831832886, "kl": 0.0007836855947971344, "learning_rate": 1.48e-06, "loss": 0.0, "num_tokens": 97545.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 297 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 5.518518518518518, "frac_reward_zero_std": 0.0, "grad_norm": 2.0968191623687744, "kl": 0.0004068720736540854, "learning_rate": 1.485e-06, "loss": 0.001, "num_tokens": 97835.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 298 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 5.537037037037037, "frac_reward_zero_std": 0.0, "grad_norm": 6.042074680328369, "kl": 0.012243836652487516, "learning_rate": 1.4900000000000001e-06, "loss": 0.001, "num_tokens": 98103.0, "reward": 7.5, "reward_std": 1.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 1.0, "step": 299 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 5.555555555555555, "frac_reward_zero_std": 1.0, "grad_norm": 0.027033185586333275, "kl": 0.0016256864764727652, "learning_rate": 1.495e-06, "loss": 0.0001, "num_tokens": 98383.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 300 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 5.574074074074074, "frac_reward_zero_std": 0.0, "grad_norm": 3.629542350769043, "kl": 0.0013911874557379633, "learning_rate": 1.5e-06, "loss": -0.0001, "num_tokens": 98683.0, "reward": 7.875, "reward_std": 0.25, "rewards/reward_combined/mean": 7.875, "rewards/reward_combined/std": 0.25, "step": 301 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 5.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 5.0, "completions/mean_terminated_length": 5.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 5.592592592592593, "frac_reward_zero_std": 0.0, "grad_norm": 14.058817863464355, "kl": 0.004208225756883621, "learning_rate": 1.505e-06, "loss": 0.1654, "num_tokens": 98903.0, "reward": 3.125, "reward_std": 1.75, "rewards/reward_combined/mean": 3.125, "rewards/reward_combined/std": 1.75, "step": 302 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.25, "completions/clipped_ratio": 0.0, "completions/max_length": 27.0, "completions/max_terminated_length": 27.0, "completions/mean_length": 19.25, "completions/mean_terminated_length": 19.25, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 5.611111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.01391538605093956, "kl": 0.0005766674585174769, "learning_rate": 1.5100000000000002e-06, "loss": 0.0, "num_tokens": 99220.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_combined/mean": 1.0, "rewards/reward_combined/std": 0.0, "step": 303 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 5.62962962962963, "frac_reward_zero_std": 0.0, "grad_norm": 7.193515300750732, "kl": 0.0025334900710731745, "learning_rate": 1.5150000000000001e-06, "loss": 0.1121, "num_tokens": 99524.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 304 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 5.648148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.009462733753025532, "kl": 0.0008763819932937622, "learning_rate": 1.52e-06, "loss": 0.0, "num_tokens": 99736.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 305 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.25, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 21.25, "completions/mean_terminated_length": 21.25, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 5.666666666666667, "frac_reward_zero_std": 1.0, "grad_norm": 0.03811901807785034, "kl": 0.001641335489694029, "learning_rate": 1.525e-06, "loss": 0.0001, "num_tokens": 100065.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 306 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 5.685185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.044658053666353226, "kl": 0.001826379681006074, "learning_rate": 1.5300000000000002e-06, "loss": 0.0001, "num_tokens": 100329.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 307 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.75, "completions/clipped_ratio": 0.0, "completions/max_length": 34.0, "completions/max_terminated_length": 34.0, "completions/mean_length": 23.75, "completions/mean_terminated_length": 23.75, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 5.703703703703704, "frac_reward_zero_std": 0.0, "grad_norm": 4.508141040802002, "kl": 0.006664464715868235, "learning_rate": 1.5350000000000001e-06, "loss": 0.1466, "num_tokens": 100664.0, "reward": 1.375, "reward_std": 1.4361406564712524, "rewards/reward_combined/mean": 1.375, "rewards/reward_combined/std": 1.4361406564712524, "step": 308 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 5.722222222222222, "frac_reward_zero_std": 1.0, "grad_norm": 0.011096024885773659, "kl": 0.00015842317952774465, "learning_rate": 1.54e-06, "loss": 0.0, "num_tokens": 100920.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 309 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.5, "completions/clipped_ratio": 0.0, "completions/max_length": 35.0, "completions/max_terminated_length": 35.0, "completions/mean_length": 18.5, "completions/mean_terminated_length": 18.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 5.7407407407407405, "frac_reward_zero_std": 0.0, "grad_norm": 6.860720157623291, "kl": 0.0011681760952342302, "learning_rate": 1.545e-06, "loss": 0.2296, "num_tokens": 101214.0, "reward": 5.875, "reward_std": 3.25, "rewards/reward_combined/mean": 5.875, "rewards/reward_combined/std": 3.25, "step": 310 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 63.0, "completions/clipped_ratio": 0.0, "completions/max_length": 149.0, "completions/max_terminated_length": 149.0, "completions/mean_length": 63.0, "completions/mean_terminated_length": 63.0, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 5.7592592592592595, "frac_reward_zero_std": 0.0, "grad_norm": 2.968238592147827, "kl": 0.0022370988153852522, "learning_rate": 1.5500000000000002e-06, "loss": 0.2768, "num_tokens": 101702.0, "reward": 1.4249999523162842, "reward_std": 1.8945096731185913, "rewards/reward_combined/mean": 1.4249999523162842, "rewards/reward_combined/std": 1.8945095539093018, "step": 311 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.5, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.5, "completions/mean_terminated_length": 6.5, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 5.777777777777778, "frac_reward_zero_std": 1.0, "grad_norm": 0.021565312519669533, "kl": 0.0003906467172782868, "learning_rate": 1.5550000000000001e-06, "loss": 0.0, "num_tokens": 101936.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 312 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 5.796296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.020359372720122337, "kl": 0.0004955505282850936, "learning_rate": 1.56e-06, "loss": 0.0, "num_tokens": 102208.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 313 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 5.814814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.00015242390509229153, "kl": 3.46451997756958e-06, "learning_rate": 1.565e-06, "loss": 0.0, "num_tokens": 102428.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 314 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 31.0, "completions/clipped_ratio": 0.0, "completions/max_length": 61.0, "completions/max_terminated_length": 61.0, "completions/mean_length": 31.0, "completions/mean_terminated_length": 31.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 5.833333333333333, "frac_reward_zero_std": 0.0, "grad_norm": 3.2755470275878906, "kl": 0.0012736133649013937, "learning_rate": 1.5700000000000002e-06, "loss": 0.2188, "num_tokens": 102804.0, "reward": 6.550000190734863, "reward_std": 2.8999998569488525, "rewards/reward_combined/mean": 6.550000190734863, "rewards/reward_combined/std": 2.8999998569488525, "step": 315 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.5, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 18.5, "completions/mean_terminated_length": 18.5, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 5.851851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.033371828496456146, "kl": 0.0022788856003899127, "learning_rate": 1.5750000000000002e-06, "loss": 0.0001, "num_tokens": 103118.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 316 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.0, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 24.0, "completions/mean_terminated_length": 24.0, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 5.87037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.00048451771726831794, "kl": 2.7435521587904077e-05, "learning_rate": 1.5800000000000001e-06, "loss": 0.0, "num_tokens": 103482.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_combined/mean": 1.0, "rewards/reward_combined/std": 0.0, "step": 317 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.0, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 19.0, "completions/mean_terminated_length": 19.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 5.888888888888889, "frac_reward_zero_std": 0.0, "grad_norm": 6.893566131591797, "kl": 0.0039166733622550964, "learning_rate": 1.585e-06, "loss": -0.0279, "num_tokens": 103778.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 318 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.25, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 14.25, "completions/mean_terminated_length": 14.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 5.907407407407407, "frac_reward_zero_std": 0.0, "grad_norm": 2.8222768306732178, "kl": 0.0004977814678568393, "learning_rate": 1.5900000000000002e-06, "loss": -0.1084, "num_tokens": 104067.0, "reward": 5.125, "reward_std": 3.4731109142303467, "rewards/reward_combined/mean": 5.125, "rewards/reward_combined/std": 3.4731109142303467, "step": 319 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 90.0, "completions/clipped_ratio": 0.25, "completions/max_length": 256.0, "completions/max_terminated_length": 38.0, "completions/mean_length": 90.0, "completions/mean_terminated_length": 34.66666793823242, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 5.925925925925926, "frac_reward_zero_std": 0.0, "grad_norm": 1.6621122360229492, "kl": 0.0006797134992666543, "learning_rate": 1.5950000000000002e-06, "loss": 0.3336, "num_tokens": 104707.0, "reward": -0.32499998807907104, "reward_std": 1.337597370147705, "rewards/reward_combined/mean": -0.32499998807907104, "rewards/reward_combined/std": 1.3375972509384155, "step": 320 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 5.944444444444445, "frac_reward_zero_std": 1.0, "grad_norm": 0.02269612066447735, "kl": 0.0019208118319511414, "learning_rate": 1.6000000000000001e-06, "loss": 0.0001, "num_tokens": 104943.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 321 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 5.962962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.009025105275213718, "kl": 0.0006974786520004272, "learning_rate": 1.605e-06, "loss": 0.0, "num_tokens": 105151.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 322 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 38.75, "completions/clipped_ratio": 0.0, "completions/max_length": 92.0, "completions/max_terminated_length": 92.0, "completions/mean_length": 38.75, "completions/mean_terminated_length": 38.75, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 5.981481481481482, "frac_reward_zero_std": 0.0, "grad_norm": 2.9681553840637207, "kl": 0.003019152325578034, "learning_rate": 1.6100000000000003e-06, "loss": 0.3459, "num_tokens": 105558.0, "reward": 4.0, "reward_std": 4.041451930999756, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 4.041451930999756, "step": 323 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 88.25, "completions/clipped_ratio": 0.25, "completions/max_length": 256.0, "completions/max_terminated_length": 44.0, "completions/mean_length": 88.25, "completions/mean_terminated_length": 32.333335876464844, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 6.0, "frac_reward_zero_std": 0.0, "grad_norm": 2.3669521808624268, "kl": 0.0023288443917408586, "learning_rate": 1.6150000000000002e-06, "loss": 0.3758, "num_tokens": 106139.0, "reward": 2.049999952316284, "reward_std": 2.245736837387085, "rewards/reward_combined/mean": 2.049999952316284, "rewards/reward_combined/std": 2.245736598968506, "step": 324 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 6.018518518518518, "frac_reward_zero_std": 1.0, "grad_norm": 0.04396497458219528, "kl": 0.0012242788798175752, "learning_rate": 1.6200000000000002e-06, "loss": 0.0001, "num_tokens": 106403.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 325 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.0, "completions/clipped_ratio": 0.0, "completions/max_length": 32.0, "completions/max_terminated_length": 32.0, "completions/mean_length": 24.0, "completions/mean_terminated_length": 24.0, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 6.037037037037037, "frac_reward_zero_std": 0.0, "grad_norm": 3.3932714462280273, "kl": 0.005420995177701116, "learning_rate": 1.6250000000000001e-06, "loss": -0.0234, "num_tokens": 106751.0, "reward": 4.75, "reward_std": 3.4034295082092285, "rewards/reward_combined/mean": 4.75, "rewards/reward_combined/std": 3.4034297466278076, "step": 326 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.25, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 19.25, "completions/mean_terminated_length": 19.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 6.055555555555555, "frac_reward_zero_std": 0.0, "grad_norm": 4.4823079109191895, "kl": 0.004790059523656964, "learning_rate": 1.6300000000000003e-06, "loss": 0.0315, "num_tokens": 107048.0, "reward": 2.875, "reward_std": 3.3008837699890137, "rewards/reward_combined/mean": 2.875, "rewards/reward_combined/std": 3.3008837699890137, "step": 327 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 40.75, "completions/clipped_ratio": 0.0, "completions/max_length": 48.0, "completions/max_terminated_length": 48.0, "completions/mean_length": 40.75, "completions/mean_terminated_length": 40.75, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 6.074074074074074, "frac_reward_zero_std": 0.0, "grad_norm": 2.0954411029815674, "kl": 0.0007266595494002104, "learning_rate": 1.6350000000000002e-06, "loss": 0.0017, "num_tokens": 107491.0, "reward": 1.4249999523162842, "reward_std": 1.0750969648361206, "rewards/reward_combined/mean": 1.4249999523162842, "rewards/reward_combined/std": 1.0750969648361206, "step": 328 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.0, "completions/clipped_ratio": 0.0, "completions/max_length": 36.0, "completions/max_terminated_length": 36.0, "completions/mean_length": 24.0, "completions/mean_terminated_length": 24.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 6.092592592592593, "frac_reward_zero_std": 0.0, "grad_norm": 6.212843418121338, "kl": 0.003046508179977536, "learning_rate": 1.6400000000000002e-06, "loss": 0.2032, "num_tokens": 107807.0, "reward": 1.25, "reward_std": 2.0615527629852295, "rewards/reward_combined/mean": 1.25, "rewards/reward_combined/std": 2.0615527629852295, "step": 329 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 5.75, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 5.75, "completions/mean_terminated_length": 5.75, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 6.111111111111111, "frac_reward_zero_std": 0.0, "grad_norm": 9.552132606506348, "kl": 0.0038169517647475004, "learning_rate": 1.6450000000000001e-06, "loss": 0.3941, "num_tokens": 108034.0, "reward": 2.75, "reward_std": 1.5, "rewards/reward_combined/mean": 2.75, "rewards/reward_combined/std": 1.5, "step": 330 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.5, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 4.5, "completions/mean_terminated_length": 4.5, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 6.12962962962963, "frac_reward_zero_std": 0.0, "grad_norm": 6.701140880584717, "kl": 0.01753268763422966, "learning_rate": 1.6500000000000003e-06, "loss": -0.0074, "num_tokens": 108272.0, "reward": 2.625, "reward_std": 1.6007810831069946, "rewards/reward_combined/mean": 2.625, "rewards/reward_combined/std": 1.6007810831069946, "step": 331 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 6.148148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.04008517041802406, "kl": 0.0016714408993721008, "learning_rate": 1.6550000000000002e-06, "loss": 0.0001, "num_tokens": 108532.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 332 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 17.0, "completions/mean_terminated_length": 17.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 6.166666666666667, "frac_reward_zero_std": 1.0, "grad_norm": 0.004771340172737837, "kl": 0.0001906139004859142, "learning_rate": 1.6600000000000002e-06, "loss": 0.0, "num_tokens": 108840.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_combined/mean": 1.0, "rewards/reward_combined/std": 0.0, "step": 333 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.25, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.25, "completions/mean_terminated_length": 11.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 6.185185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.021851925179362297, "kl": 0.0008589891076553613, "learning_rate": 1.6650000000000002e-06, "loss": 0.0, "num_tokens": 109113.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 334 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 6.203703703703703, "frac_reward_zero_std": 1.0, "grad_norm": 0.07419493049383163, "kl": 0.0011369846761226654, "learning_rate": 1.6700000000000003e-06, "loss": 0.0001, "num_tokens": 109325.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 335 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.75, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.75, "completions/mean_terminated_length": 6.75, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 6.222222222222222, "frac_reward_zero_std": 1.0, "grad_norm": 0.013154740445315838, "kl": 0.0002784736134344712, "learning_rate": 1.6750000000000003e-06, "loss": 0.0, "num_tokens": 109560.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 336 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.0, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 17.0, "completions/mean_terminated_length": 17.0, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 6.2407407407407405, "frac_reward_zero_std": 1.0, "grad_norm": 0.026233648881316185, "kl": 0.003051085048355162, "learning_rate": 1.6800000000000002e-06, "loss": 0.0001, "num_tokens": 109856.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 337 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 6.2592592592592595, "frac_reward_zero_std": 1.0, "grad_norm": 0.042487844824790955, "kl": 0.002149757849110756, "learning_rate": 1.6850000000000002e-06, "loss": 0.0001, "num_tokens": 110128.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 338 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.75, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 12.75, "completions/mean_terminated_length": 12.75, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 6.277777777777778, "frac_reward_zero_std": 0.0, "grad_norm": 5.455592155456543, "kl": 0.0010579637310002, "learning_rate": 1.6900000000000003e-06, "loss": 0.0038, "num_tokens": 110411.0, "reward": 4.5, "reward_std": 4.041451930999756, "rewards/reward_combined/mean": 4.5, "rewards/reward_combined/std": 4.041451930999756, "step": 339 }, { "clip_ratio/high_max": 0.006097560748457909, "clip_ratio/high_mean": 0.006097560748457909, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.006097560748457909, "completion_length": 35.5, "completions/clipped_ratio": 0.0, "completions/max_length": 47.0, "completions/max_terminated_length": 47.0, "completions/mean_length": 35.5, "completions/mean_terminated_length": 35.5, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 6.296296296296296, "frac_reward_zero_std": 0.0, "grad_norm": 4.819808483123779, "kl": 0.0012327884614933282, "learning_rate": 1.6950000000000003e-06, "loss": 0.0936, "num_tokens": 110769.0, "reward": 2.25, "reward_std": 1.443375587463379, "rewards/reward_combined/mean": 2.25, "rewards/reward_combined/std": 1.4433757066726685, "step": 340 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 6.314814814814815, "frac_reward_zero_std": 0.0, "grad_norm": 5.470825672149658, "kl": 0.0012864073505625129, "learning_rate": 1.7000000000000002e-06, "loss": 0.1586, "num_tokens": 111075.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 341 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.75, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 4.75, "completions/mean_terminated_length": 4.75, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 6.333333333333333, "frac_reward_zero_std": 1.0, "grad_norm": 0.06834530085325241, "kl": 0.0027276099717710167, "learning_rate": 1.7050000000000002e-06, "loss": 0.0001, "num_tokens": 111294.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 342 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 6.351851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.01180008240044117, "kl": 0.00045571177906822413, "learning_rate": 1.7100000000000004e-06, "loss": 0.0, "num_tokens": 111554.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 343 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.25, "completions/clipped_ratio": 0.0, "completions/max_length": 25.0, "completions/max_terminated_length": 25.0, "completions/mean_length": 17.25, "completions/mean_terminated_length": 17.25, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 6.37037037037037, "frac_reward_zero_std": 0.0, "grad_norm": 5.780444145202637, "kl": 0.0035271833185106516, "learning_rate": 1.7150000000000003e-06, "loss": 0.2041, "num_tokens": 111843.0, "reward": 4.25, "reward_std": 3.752776622772217, "rewards/reward_combined/mean": 4.25, "rewards/reward_combined/std": 3.752776861190796, "step": 344 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 31.75, "completions/clipped_ratio": 0.0, "completions/max_length": 71.0, "completions/max_terminated_length": 71.0, "completions/mean_length": 31.75, "completions/mean_terminated_length": 31.75, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 6.388888888888889, "frac_reward_zero_std": 0.0, "grad_norm": 3.046812057495117, "kl": 0.00504549453034997, "learning_rate": 1.72e-06, "loss": -0.0701, "num_tokens": 112210.0, "reward": 3.875, "reward_std": 2.9545164108276367, "rewards/reward_combined/mean": 3.875, "rewards/reward_combined/std": 2.9545164108276367, "step": 345 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 85.0, "completions/clipped_ratio": 0.25, "completions/max_length": 256.0, "completions/max_terminated_length": 38.0, "completions/mean_length": 85.0, "completions/mean_terminated_length": 28.0, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 6.407407407407407, "frac_reward_zero_std": 0.0, "grad_norm": 2.3173816204071045, "kl": 0.003237336641177535, "learning_rate": 1.725e-06, "loss": 0.502, "num_tokens": 112770.0, "reward": 0.875, "reward_std": 4.643543720245361, "rewards/reward_combined/mean": 0.875, "rewards/reward_combined/std": 4.643543720245361, "step": 346 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.5, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 20.5, "completions/mean_terminated_length": 20.5, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 6.425925925925926, "frac_reward_zero_std": 0.0, "grad_norm": 6.8316650390625, "kl": 0.0031989957205951214, "learning_rate": 1.73e-06, "loss": 0.0133, "num_tokens": 113100.0, "reward": 2.5, "reward_std": 1.7320507764816284, "rewards/reward_combined/mean": 2.5, "rewards/reward_combined/std": 1.7320507764816284, "step": 347 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.5, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 16.5, "completions/mean_terminated_length": 16.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 6.444444444444445, "frac_reward_zero_std": 1.0, "grad_norm": 0.08387290686368942, "kl": 0.004850049968808889, "learning_rate": 1.7350000000000001e-06, "loss": 0.0003, "num_tokens": 113390.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 348 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.75, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 18.75, "completions/mean_terminated_length": 18.75, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 6.462962962962963, "frac_reward_zero_std": 0.0, "grad_norm": 5.903375625610352, "kl": 0.006802509073168039, "learning_rate": 1.74e-06, "loss": 0.0698, "num_tokens": 113689.0, "reward": 4.75, "reward_std": 3.617089033126831, "rewards/reward_combined/mean": 4.75, "rewards/reward_combined/std": 3.61708927154541, "step": 349 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 70.0, "completions/clipped_ratio": 0.25, "completions/max_length": 256.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 70.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 6.481481481481482, "frac_reward_zero_std": 0.0, "grad_norm": 2.2876973152160645, "kl": 0.00017722812481224537, "learning_rate": 1.745e-06, "loss": 0.4711, "num_tokens": 114181.0, "reward": 2.25, "reward_std": 3.5, "rewards/reward_combined/mean": 2.25, "rewards/reward_combined/std": 3.5, "step": 350 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.25, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 20.25, "completions/mean_terminated_length": 20.25, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 6.5, "frac_reward_zero_std": 0.0, "grad_norm": 3.514647960662842, "kl": 0.0017127252649515867, "learning_rate": 1.75e-06, "loss": 0.0143, "num_tokens": 114514.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 351 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.0, "completions/clipped_ratio": 0.0, "completions/max_length": 25.0, "completions/max_terminated_length": 25.0, "completions/mean_length": 24.0, "completions/mean_terminated_length": 24.0, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 6.518518518518518, "frac_reward_zero_std": 0.0, "grad_norm": 2.166140556335449, "kl": 0.0003040807096112985, "learning_rate": 1.7550000000000001e-06, "loss": -0.0139, "num_tokens": 114878.0, "reward": 1.625, "reward_std": 1.25, "rewards/reward_combined/mean": 1.625, "rewards/reward_combined/std": 1.25, "step": 352 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 26.25, "completions/clipped_ratio": 0.0, "completions/max_length": 33.0, "completions/max_terminated_length": 33.0, "completions/mean_length": 26.25, "completions/mean_terminated_length": 26.25, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 6.537037037037037, "frac_reward_zero_std": 0.0, "grad_norm": 3.206137180328369, "kl": 0.001418448518961668, "learning_rate": 1.76e-06, "loss": 0.1152, "num_tokens": 115211.0, "reward": 5.125, "reward_std": 3.4731109142303467, "rewards/reward_combined/mean": 5.125, "rewards/reward_combined/std": 3.4731109142303467, "step": 353 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 6.555555555555555, "frac_reward_zero_std": 1.0, "grad_norm": 0.018450049683451653, "kl": 0.0004337641003075987, "learning_rate": 1.765e-06, "loss": 0.0, "num_tokens": 115483.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 354 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.5, "completions/clipped_ratio": 0.0, "completions/max_length": 45.0, "completions/max_terminated_length": 45.0, "completions/mean_length": 19.5, "completions/mean_terminated_length": 19.5, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 6.574074074074074, "frac_reward_zero_std": 0.0, "grad_norm": 6.061115741729736, "kl": 0.007278661010786891, "learning_rate": 1.77e-06, "loss": 0.0512, "num_tokens": 115785.0, "reward": 4.0, "reward_std": 2.915475845336914, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 2.915475845336914, "step": 355 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 31.75, "completions/clipped_ratio": 0.0, "completions/max_length": 41.0, "completions/max_terminated_length": 41.0, "completions/mean_length": 31.75, "completions/mean_terminated_length": 31.75, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 6.592592592592593, "frac_reward_zero_std": 0.0, "grad_norm": 5.148343563079834, "kl": 0.002805492316838354, "learning_rate": 1.7750000000000002e-06, "loss": 0.1054, "num_tokens": 116140.0, "reward": 1.25, "reward_std": 1.1902379989624023, "rewards/reward_combined/mean": 1.25, "rewards/reward_combined/std": 1.190238118171692, "step": 356 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 26.25, "completions/clipped_ratio": 0.0, "completions/max_length": 32.0, "completions/max_terminated_length": 32.0, "completions/mean_length": 26.25, "completions/mean_terminated_length": 26.25, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 6.611111111111111, "frac_reward_zero_std": 0.0, "grad_norm": 4.840747356414795, "kl": 0.003700417000800371, "learning_rate": 1.7800000000000001e-06, "loss": 0.1741, "num_tokens": 116473.0, "reward": 2.0, "reward_std": 1.7320507764816284, "rewards/reward_combined/mean": 2.0, "rewards/reward_combined/std": 1.7320507764816284, "step": 357 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 6.62962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.00012292632891330868, "kl": 2.5704503059387207e-06, "learning_rate": 1.785e-06, "loss": 0.0, "num_tokens": 116693.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 358 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 6.648148148148148, "frac_reward_zero_std": 0.0, "grad_norm": 4.811054229736328, "kl": 0.005083501571789384, "learning_rate": 1.79e-06, "loss": -0.0652, "num_tokens": 116991.0, "reward": 6.125, "reward_std": 3.75, "rewards/reward_combined/mean": 6.125, "rewards/reward_combined/std": 3.75, "step": 359 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.75, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 13.75, "completions/mean_terminated_length": 13.75, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 6.666666666666667, "frac_reward_zero_std": 0.0, "grad_norm": 4.038796424865723, "kl": 0.001818744873162359, "learning_rate": 1.7950000000000002e-06, "loss": 0.0196, "num_tokens": 117270.0, "reward": 6.125, "reward_std": 3.75, "rewards/reward_combined/mean": 6.125, "rewards/reward_combined/std": 3.75, "step": 360 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.75, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.75, "completions/mean_terminated_length": 14.75, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 6.685185185185185, "frac_reward_zero_std": 0.0, "grad_norm": 3.268385171890259, "kl": 0.0008640390879008919, "learning_rate": 1.8000000000000001e-06, "loss": 0.0351, "num_tokens": 117561.0, "reward": 2.25, "reward_std": 3.5, "rewards/reward_combined/mean": 2.25, "rewards/reward_combined/std": 3.5, "step": 361 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 18.0, "completions/mean_terminated_length": 18.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 6.703703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.01058545708656311, "kl": 0.0009899064898490906, "learning_rate": 1.805e-06, "loss": 0.0, "num_tokens": 117873.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 362 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 6.722222222222222, "frac_reward_zero_std": 1.0, "grad_norm": 0.013949384912848473, "kl": 0.00021659881895175204, "learning_rate": 1.81e-06, "loss": 0.0, "num_tokens": 118129.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 363 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.25, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 15.25, "completions/mean_terminated_length": 15.25, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 6.7407407407407405, "frac_reward_zero_std": 0.0, "grad_norm": 5.025087356567383, "kl": 0.0004866891395067796, "learning_rate": 1.8150000000000002e-06, "loss": 0.074, "num_tokens": 118422.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 364 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 46.25, "completions/clipped_ratio": 0.0, "completions/max_length": 71.0, "completions/max_terminated_length": 71.0, "completions/mean_length": 46.25, "completions/mean_terminated_length": 46.25, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 6.7592592592592595, "frac_reward_zero_std": 0.0, "grad_norm": 3.70544171333313, "kl": 0.0019905922235921025, "learning_rate": 1.8200000000000002e-06, "loss": 0.2878, "num_tokens": 118831.0, "reward": 2.375, "reward_std": 3.8810436725616455, "rewards/reward_combined/mean": 2.375, "rewards/reward_combined/std": 3.8810436725616455, "step": 365 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 6.777777777777778, "frac_reward_zero_std": 1.0, "grad_norm": 0.004724662750959396, "kl": 7.076895053614862e-05, "learning_rate": 1.825e-06, "loss": 0.0, "num_tokens": 119155.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 366 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 79.75, "completions/clipped_ratio": 0.25, "completions/max_length": 256.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 79.75, "completions/mean_terminated_length": 21.0, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 6.796296296296296, "frac_reward_zero_std": 0.0, "grad_norm": 1.947731375694275, "kl": 0.001046268007485196, "learning_rate": 1.83e-06, "loss": 0.3602, "num_tokens": 119718.0, "reward": 4.375, "reward_std": 4.190763473510742, "rewards/reward_combined/mean": 4.375, "rewards/reward_combined/std": 4.190763473510742, "step": 367 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.25, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.25, "completions/mean_terminated_length": 13.25, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 6.814814814814815, "frac_reward_zero_std": 0.0, "grad_norm": 3.456359386444092, "kl": 0.0022804804029874504, "learning_rate": 1.8350000000000002e-06, "loss": 0.0205, "num_tokens": 120003.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 368 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 70.25, "completions/clipped_ratio": 0.25, "completions/max_length": 256.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 70.25, "completions/mean_terminated_length": 8.333333969116211, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 6.833333333333333, "frac_reward_zero_std": 0.0, "grad_norm": 3.9383633136749268, "kl": 0.006497891154140234, "learning_rate": 1.8400000000000002e-06, "loss": -0.0824, "num_tokens": 120484.0, "reward": 1.2999999523162842, "reward_std": 3.155946731567383, "rewards/reward_combined/mean": 1.2999999523162842, "rewards/reward_combined/std": 3.155946969985962, "step": 369 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 45.75, "completions/clipped_ratio": 0.0, "completions/max_length": 61.0, "completions/max_terminated_length": 61.0, "completions/mean_length": 45.75, "completions/mean_terminated_length": 45.75, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "epoch": 6.851851851851852, "frac_reward_zero_std": 0.0, "grad_norm": 2.796783208847046, "kl": 0.004132682224735618, "learning_rate": 1.8450000000000001e-06, "loss": 0.0441, "num_tokens": 120903.0, "reward": 2.5, "reward_std": 4.102844715118408, "rewards/reward_combined/mean": 2.5, "rewards/reward_combined/std": 4.102844715118408, "step": 370 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.5, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 9.5, "completions/mean_terminated_length": 9.5, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 6.87037037037037, "frac_reward_zero_std": 0.0, "grad_norm": 5.752232074737549, "kl": 0.007602685363963246, "learning_rate": 1.85e-06, "loss": 0.0031, "num_tokens": 121165.0, "reward": 4.0, "reward_std": 4.041451930999756, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 4.041451930999756, "step": 371 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 6.888888888888889, "frac_reward_zero_std": 0.0, "grad_norm": 8.72385025024414, "kl": 0.007616253104060888, "learning_rate": 1.8550000000000002e-06, "loss": -0.0013, "num_tokens": 121425.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 372 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.25, "completions/clipped_ratio": 0.0, "completions/max_length": 25.0, "completions/max_terminated_length": 25.0, "completions/mean_length": 18.25, "completions/mean_terminated_length": 18.25, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 6.907407407407407, "frac_reward_zero_std": 0.0, "grad_norm": 3.2291672229766846, "kl": 0.004002431873232126, "learning_rate": 1.8600000000000002e-06, "loss": 0.1075, "num_tokens": 121746.0, "reward": 2.75, "reward_std": 1.5, "rewards/reward_combined/mean": 2.75, "rewards/reward_combined/std": 1.5, "step": 373 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.25, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.25, "completions/mean_terminated_length": 13.25, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 6.925925925925926, "frac_reward_zero_std": 0.0, "grad_norm": 6.674001693725586, "kl": 0.0027351422468200326, "learning_rate": 1.8650000000000001e-06, "loss": 0.0178, "num_tokens": 122015.0, "reward": 4.5, "reward_std": 4.041451930999756, "rewards/reward_combined/mean": 4.5, "rewards/reward_combined/std": 4.041451930999756, "step": 374 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 40.75, "completions/clipped_ratio": 0.0, "completions/max_length": 96.0, "completions/max_terminated_length": 96.0, "completions/mean_length": 40.75, "completions/mean_terminated_length": 40.75, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 6.944444444444445, "frac_reward_zero_std": 0.0, "grad_norm": 1.761277675628662, "kl": 0.005078910966403782, "learning_rate": 1.87e-06, "loss": -0.2941, "num_tokens": 122426.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 375 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 6.962962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.10792737454175949, "kl": 0.0033355706837028265, "learning_rate": 1.8750000000000003e-06, "loss": 0.0002, "num_tokens": 122692.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 376 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 6.981481481481482, "frac_reward_zero_std": 1.0, "grad_norm": 0.015270176343619823, "kl": 0.001400600653141737, "learning_rate": 1.8800000000000002e-06, "loss": 0.0001, "num_tokens": 122960.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 377 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.5, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 2.5, "completions/mean_terminated_length": 2.5, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 7.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.03814214840531349, "kl": 0.0016468316316604614, "learning_rate": 1.8850000000000002e-06, "loss": 0.0001, "num_tokens": 123166.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 378 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 7.018518518518518, "frac_reward_zero_std": 1.0, "grad_norm": 0.010253285989165306, "kl": 0.0018462538719177246, "learning_rate": 1.8900000000000001e-06, "loss": 0.0001, "num_tokens": 123378.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 379 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 34.25, "completions/clipped_ratio": 0.0, "completions/max_length": 51.0, "completions/max_terminated_length": 51.0, "completions/mean_length": 34.25, "completions/mean_terminated_length": 34.25, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 7.037037037037037, "frac_reward_zero_std": 0.0, "grad_norm": 4.582128524780273, "kl": 0.009025686187669635, "learning_rate": 1.895e-06, "loss": 0.0455, "num_tokens": 123751.0, "reward": 2.5, "reward_std": 1.0, "rewards/reward_combined/mean": 2.5, "rewards/reward_combined/std": 1.0, "step": 380 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 7.055555555555555, "frac_reward_zero_std": 1.0, "grad_norm": 0.01120007038116455, "kl": 0.0005121499416418374, "learning_rate": 1.9000000000000002e-06, "loss": 0.0, "num_tokens": 124011.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 381 }, { "clip_ratio/high_max": 0.010638297535479069, "clip_ratio/high_mean": 0.010638297535479069, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010638297535479069, "completion_length": 24.0, "completions/clipped_ratio": 0.0, "completions/max_length": 27.0, "completions/max_terminated_length": 27.0, "completions/mean_length": 24.0, "completions/mean_terminated_length": 24.0, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 7.074074074074074, "frac_reward_zero_std": 0.0, "grad_norm": 6.716611862182617, "kl": 0.004164873389527202, "learning_rate": 1.9050000000000002e-06, "loss": 0.075, "num_tokens": 124355.0, "reward": 2.5, "reward_std": 1.7320507764816284, "rewards/reward_combined/mean": 2.5, "rewards/reward_combined/std": 1.7320507764816284, "step": 382 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.5, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 19.5, "completions/mean_terminated_length": 19.5, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 7.092592592592593, "frac_reward_zero_std": 0.0, "grad_norm": 4.915200710296631, "kl": 0.003422527341172099, "learning_rate": 1.9100000000000003e-06, "loss": -0.0663, "num_tokens": 124653.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 383 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 7.111111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.06738001853227615, "kl": 0.001955825078766793, "learning_rate": 1.9150000000000003e-06, "loss": 0.0001, "num_tokens": 124921.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 384 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 7.12962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.025637509301304817, "kl": 0.003939539194107056, "learning_rate": 1.9200000000000003e-06, "loss": 0.0002, "num_tokens": 125157.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 385 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.75, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 11.75, "completions/mean_terminated_length": 11.75, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 7.148148148148148, "frac_reward_zero_std": 0.0, "grad_norm": 3.926583766937256, "kl": 0.0016816090210340917, "learning_rate": 1.925e-06, "loss": -0.0622, "num_tokens": 125436.0, "reward": 2.75, "reward_std": 3.5, "rewards/reward_combined/mean": 2.75, "rewards/reward_combined/std": 3.5, "step": 386 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.25, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 15.25, "completions/mean_terminated_length": 15.25, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 7.166666666666667, "frac_reward_zero_std": 1.0, "grad_norm": 0.03291429579257965, "kl": 0.001330711878836155, "learning_rate": 1.93e-06, "loss": 0.0001, "num_tokens": 125709.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 387 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 18.0, "completions/mean_terminated_length": 18.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 7.185185185185185, "frac_reward_zero_std": 0.0, "grad_norm": 4.7526140213012695, "kl": 0.004410982830449939, "learning_rate": 1.935e-06, "loss": 0.1687, "num_tokens": 126025.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 388 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 17.0, "completions/mean_terminated_length": 17.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 7.203703703703703, "frac_reward_zero_std": 1.0, "grad_norm": 0.004800163675099611, "kl": 0.00019893401622539386, "learning_rate": 1.94e-06, "loss": 0.0, "num_tokens": 126333.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_combined/mean": 1.0, "rewards/reward_combined/std": 0.0, "step": 389 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.5, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 18.5, "completions/mean_terminated_length": 18.5, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 7.222222222222222, "frac_reward_zero_std": 1.0, "grad_norm": 0.028314903378486633, "kl": 0.0017643510363996029, "learning_rate": 1.945e-06, "loss": 0.0001, "num_tokens": 126647.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 390 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 28.25, "completions/clipped_ratio": 0.0, "completions/max_length": 41.0, "completions/max_terminated_length": 41.0, "completions/mean_length": 28.25, "completions/mean_terminated_length": 28.25, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 7.2407407407407405, "frac_reward_zero_std": 0.0, "grad_norm": 2.3853137493133545, "kl": 0.0013765509356744587, "learning_rate": 1.9500000000000004e-06, "loss": 0.1067, "num_tokens": 126988.0, "reward": 3.25, "reward_std": 3.3040380477905273, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 3.3040380477905273, "step": 391 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 7.2592592592592595, "frac_reward_zero_std": 1.0, "grad_norm": 0.055021144449710846, "kl": 0.0054574329406023026, "learning_rate": 1.9550000000000003e-06, "loss": 0.0003, "num_tokens": 127268.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 392 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 7.277777777777778, "frac_reward_zero_std": 1.0, "grad_norm": 0.001960948808118701, "kl": 0.00013681948621524498, "learning_rate": 1.9600000000000003e-06, "loss": 0.0, "num_tokens": 127524.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 393 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 33.75, "completions/clipped_ratio": 0.0, "completions/max_length": 36.0, "completions/max_terminated_length": 36.0, "completions/mean_length": 33.75, "completions/mean_terminated_length": 33.75, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 7.296296296296296, "frac_reward_zero_std": 0.0, "grad_norm": 1.2811481952667236, "kl": 0.0009387581958435476, "learning_rate": 1.9650000000000002e-06, "loss": 0.0007, "num_tokens": 127939.0, "reward": 0.7250000238418579, "reward_std": 1.053169846534729, "rewards/reward_combined/mean": 0.7250000238418579, "rewards/reward_combined/std": 1.0531699657440186, "step": 394 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 29.0, "completions/clipped_ratio": 0.0, "completions/max_length": 33.0, "completions/max_terminated_length": 33.0, "completions/mean_length": 29.0, "completions/mean_terminated_length": 29.0, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 7.314814814814815, "frac_reward_zero_std": 0.0, "grad_norm": 5.487381935119629, "kl": 0.011904643382877111, "learning_rate": 1.97e-06, "loss": -0.043, "num_tokens": 128279.0, "reward": 5.25, "reward_std": 2.598076105117798, "rewards/reward_combined/mean": 5.25, "rewards/reward_combined/std": 2.598076105117798, "step": 395 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.011627906933426857, "clip_ratio/low_min": 0.011627906933426857, "clip_ratio/region_mean": 0.011627906933426857, "completion_length": 22.75, "completions/clipped_ratio": 0.0, "completions/max_length": 27.0, "completions/max_terminated_length": 27.0, "completions/mean_length": 22.75, "completions/mean_terminated_length": 22.75, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 7.333333333333333, "frac_reward_zero_std": 0.0, "grad_norm": 3.191708564758301, "kl": 0.006280530709773302, "learning_rate": 1.975e-06, "loss": 0.0635, "num_tokens": 128590.0, "reward": 3.5, "reward_std": 2.915475845336914, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 2.915475845336914, "step": 396 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.75, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 19.75, "completions/mean_terminated_length": 19.75, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 7.351851851851852, "frac_reward_zero_std": 0.0, "grad_norm": 2.614377498626709, "kl": 0.004639265360310674, "learning_rate": 1.98e-06, "loss": 0.1139, "num_tokens": 128917.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 397 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 7.37037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.00015437055844813585, "kl": 3.553926944732666e-06, "learning_rate": 1.985e-06, "loss": 0.0, "num_tokens": 129137.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 398 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 79.5, "completions/clipped_ratio": 0.25, "completions/max_length": 256.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 79.5, "completions/mean_terminated_length": 20.666667938232422, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 7.388888888888889, "frac_reward_zero_std": 0.0, "grad_norm": 1.5996809005737305, "kl": 0.0034413100220263004, "learning_rate": 1.9900000000000004e-06, "loss": 0.4625, "num_tokens": 129707.0, "reward": 4.125, "reward_std": 5.202163219451904, "rewards/reward_combined/mean": 4.125, "rewards/reward_combined/std": 5.202163219451904, "step": 399 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 7.407407407407407, "frac_reward_zero_std": 0.0, "grad_norm": 5.066418170928955, "kl": 0.0029476784402504563, "learning_rate": 1.9950000000000004e-06, "loss": 0.0207, "num_tokens": 129981.0, "reward": 5.125, "reward_std": 3.4731109142303467, "rewards/reward_combined/mean": 5.125, "rewards/reward_combined/std": 3.4731109142303467, "step": 400 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 7.425925925925926, "frac_reward_zero_std": 0.0, "grad_norm": 5.647677898406982, "kl": 0.0007448247633874416, "learning_rate": 2.0000000000000003e-06, "loss": 0.0914, "num_tokens": 130305.0, "reward": 5.25, "reward_std": 4.5, "rewards/reward_combined/mean": 5.25, "rewards/reward_combined/std": 4.5, "step": 401 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 26.5, "completions/clipped_ratio": 0.0, "completions/max_length": 43.0, "completions/max_terminated_length": 43.0, "completions/mean_length": 26.5, "completions/mean_terminated_length": 26.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 7.444444444444445, "frac_reward_zero_std": 0.0, "grad_norm": 4.665159225463867, "kl": 0.008138323668390512, "learning_rate": 2.0050000000000003e-06, "loss": -0.0638, "num_tokens": 130635.0, "reward": 3.125, "reward_std": 3.0652623176574707, "rewards/reward_combined/mean": 3.125, "rewards/reward_combined/std": 3.06526255607605, "step": 402 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 94.0, "completions/clipped_ratio": 0.25, "completions/max_length": 256.0, "completions/max_terminated_length": 59.0, "completions/mean_length": 94.0, "completions/mean_terminated_length": 40.0, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 7.462962962962963, "frac_reward_zero_std": 0.0, "grad_norm": 1.9898173809051514, "kl": 0.002892374526709318, "learning_rate": 2.0100000000000002e-06, "loss": 0.2367, "num_tokens": 131231.0, "reward": -0.3250000476837158, "reward_std": 2.7060117721557617, "rewards/reward_combined/mean": -0.3250000476837158, "rewards/reward_combined/std": 2.7060117721557617, "step": 403 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 85.0, "completions/clipped_ratio": 0.25, "completions/max_length": 256.0, "completions/max_terminated_length": 33.0, "completions/mean_length": 85.0, "completions/mean_terminated_length": 28.0, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 7.481481481481482, "frac_reward_zero_std": 0.0, "grad_norm": 1.8234716653823853, "kl": 0.00342160917352885, "learning_rate": 2.015e-06, "loss": -0.062, "num_tokens": 131787.0, "reward": 0.800000011920929, "reward_std": 1.9815819263458252, "rewards/reward_combined/mean": 0.800000011920929, "rewards/reward_combined/std": 1.9815819263458252, "step": 404 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.0, "completions/clipped_ratio": 0.0, "completions/max_length": 29.0, "completions/max_terminated_length": 29.0, "completions/mean_length": 23.0, "completions/mean_terminated_length": 23.0, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 7.5, "frac_reward_zero_std": 0.0, "grad_norm": 4.208947658538818, "kl": 0.006619830150157213, "learning_rate": 2.02e-06, "loss": -0.0933, "num_tokens": 132107.0, "reward": 3.375, "reward_std": 3.3008837699890137, "rewards/reward_combined/mean": 3.375, "rewards/reward_combined/std": 3.3008837699890137, "step": 405 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 7.518518518518518, "frac_reward_zero_std": 1.0, "grad_norm": 0.012280282564461231, "kl": 0.0002984652091981843, "learning_rate": 2.025e-06, "loss": 0.0, "num_tokens": 132401.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 406 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.75, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 13.75, "completions/mean_terminated_length": 13.75, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 7.537037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.040411852300167084, "kl": 0.00209359492873773, "learning_rate": 2.0300000000000005e-06, "loss": 0.0001, "num_tokens": 132680.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 407 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 5.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 5.0, "completions/mean_terminated_length": 5.0, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 7.555555555555555, "frac_reward_zero_std": 1.0, "grad_norm": 0.00458539929240942, "kl": 0.00021530389494728297, "learning_rate": 2.035e-06, "loss": 0.0, "num_tokens": 132900.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 408 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.25, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 13.25, "completions/mean_terminated_length": 13.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 7.574074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.034151263535022736, "kl": 0.0020523788989521563, "learning_rate": 2.04e-06, "loss": 0.0001, "num_tokens": 133185.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 409 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.25, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 15.25, "completions/mean_terminated_length": 15.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 7.592592592592593, "frac_reward_zero_std": 0.0, "grad_norm": 5.052777290344238, "kl": 0.007712302263826132, "learning_rate": 2.045e-06, "loss": 0.2256, "num_tokens": 133498.0, "reward": 5.125, "reward_std": 3.4731109142303467, "rewards/reward_combined/mean": 5.125, "rewards/reward_combined/std": 3.4731109142303467, "step": 410 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.75, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 23.75, "completions/mean_terminated_length": 23.75, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 7.611111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.0005138571723364294, "kl": 4.1673556552268565e-05, "learning_rate": 2.05e-06, "loss": 0.0, "num_tokens": 133861.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_combined/mean": 1.0, "rewards/reward_combined/std": 0.0, "step": 411 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.0, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 17.0, "completions/mean_terminated_length": 17.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 7.62962962962963, "frac_reward_zero_std": 0.0, "grad_norm": 6.115057945251465, "kl": 0.010108688846230507, "learning_rate": 2.0550000000000002e-06, "loss": -0.0697, "num_tokens": 134149.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 412 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.75, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 18.75, "completions/mean_terminated_length": 18.75, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 7.648148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.027117323130369186, "kl": 0.0017401889199391007, "learning_rate": 2.06e-06, "loss": 0.0001, "num_tokens": 134452.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 413 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.75, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.75, "completions/mean_terminated_length": 6.75, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 7.666666666666667, "frac_reward_zero_std": 1.0, "grad_norm": 0.0031791343353688717, "kl": 0.0001436642269254662, "learning_rate": 2.065e-06, "loss": 0.0, "num_tokens": 134687.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 414 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 7.685185185185185, "frac_reward_zero_std": 0.0, "grad_norm": 4.275808811187744, "kl": 0.00408494658768177, "learning_rate": 2.07e-06, "loss": 0.0666, "num_tokens": 134975.0, "reward": 4.0, "reward_std": 4.041451930999756, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 4.041451930999756, "step": 415 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 7.703703703703704, "frac_reward_zero_std": 0.0, "grad_norm": 6.721722602844238, "kl": 0.0038016302278265357, "learning_rate": 2.075e-06, "loss": -0.0106, "num_tokens": 135247.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 416 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.75, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 11.75, "completions/mean_terminated_length": 11.75, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 7.722222222222222, "frac_reward_zero_std": 0.0, "grad_norm": 9.70695972442627, "kl": 0.0038458160415757447, "learning_rate": 2.08e-06, "loss": 0.1559, "num_tokens": 135514.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 417 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 10.5, "completions/mean_terminated_length": 10.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 7.7407407407407405, "frac_reward_zero_std": 1.0, "grad_norm": 0.0528675802052021, "kl": 0.001092689752113074, "learning_rate": 2.085e-06, "loss": 0.0001, "num_tokens": 135780.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 418 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 7.7592592592592595, "frac_reward_zero_std": 1.0, "grad_norm": 0.04101642593741417, "kl": 0.0019759200513362885, "learning_rate": 2.09e-06, "loss": 0.0001, "num_tokens": 136040.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 419 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 84.75, "completions/clipped_ratio": 0.25, "completions/max_length": 256.0, "completions/max_terminated_length": 38.0, "completions/mean_length": 84.75, "completions/mean_terminated_length": 27.666667938232422, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 7.777777777777778, "frac_reward_zero_std": 0.0, "grad_norm": 6.21834135055542, "kl": 0.022096805972978473, "learning_rate": 2.0950000000000003e-06, "loss": 0.128, "num_tokens": 136619.0, "reward": 3.049999952316284, "reward_std": 3.503807544708252, "rewards/reward_combined/mean": 3.049999952316284, "rewards/reward_combined/std": 3.503807544708252, "step": 420 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 7.796296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.030547378584742546, "kl": 0.004664515145123005, "learning_rate": 2.1000000000000002e-06, "loss": 0.0002, "num_tokens": 136931.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 421 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.25, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 4.25, "completions/mean_terminated_length": 4.25, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 7.814814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.025615297257900238, "kl": 0.0002962619037134573, "learning_rate": 2.105e-06, "loss": 0.0, "num_tokens": 137144.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 422 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 82.0, "completions/clipped_ratio": 0.25, "completions/max_length": 256.0, "completions/max_terminated_length": 27.0, "completions/mean_length": 82.0, "completions/mean_terminated_length": 24.0, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 7.833333333333333, "frac_reward_zero_std": 0.0, "grad_norm": 1.5301644802093506, "kl": 0.008221306139603257, "learning_rate": 2.11e-06, "loss": 0.507, "num_tokens": 137724.0, "reward": 1.625, "reward_std": 4.210601806640625, "rewards/reward_combined/mean": 1.625, "rewards/reward_combined/std": 4.210601806640625, "step": 423 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 7.851851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.012531968764960766, "kl": 0.0017801051144488156, "learning_rate": 2.115e-06, "loss": 0.0001, "num_tokens": 137988.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 424 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 7.87037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.06827722489833832, "kl": 0.00219533103518188, "learning_rate": 2.12e-06, "loss": 0.0001, "num_tokens": 138200.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 425 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 7.888888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.10145949572324753, "kl": 0.007357046008110046, "learning_rate": 2.125e-06, "loss": 0.0004, "num_tokens": 138494.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 426 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 7.907407407407407, "frac_reward_zero_std": 1.0, "grad_norm": 0.0322817824780941, "kl": 0.00988923991099, "learning_rate": 2.13e-06, "loss": 0.0005, "num_tokens": 138754.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 427 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 7.925925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.03341434895992279, "kl": 0.0007542030070908368, "learning_rate": 2.1350000000000003e-06, "loss": 0.0, "num_tokens": 139002.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 428 }, { "clip_ratio/high_max": 0.010638297535479069, "clip_ratio/high_mean": 0.010638297535479069, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010638297535479069, "completion_length": 31.0, "completions/clipped_ratio": 0.0, "completions/max_length": 45.0, "completions/max_terminated_length": 45.0, "completions/mean_length": 31.0, "completions/mean_terminated_length": 31.0, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 7.944444444444445, "frac_reward_zero_std": 0.0, "grad_norm": 4.309067249298096, "kl": 0.00475471664685756, "learning_rate": 2.1400000000000003e-06, "loss": 0.0177, "num_tokens": 139354.0, "reward": 3.5, "reward_std": 4.690415859222412, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 4.690415859222412, "step": 429 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.75, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 11.75, "completions/mean_terminated_length": 11.75, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 7.962962962962963, "frac_reward_zero_std": 0.0, "grad_norm": 5.035886764526367, "kl": 0.0010013999417424202, "learning_rate": 2.1450000000000002e-06, "loss": 0.0678, "num_tokens": 139621.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 430 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 7.981481481481482, "frac_reward_zero_std": 0.0, "grad_norm": 9.136139869689941, "kl": 0.036479562520980835, "learning_rate": 2.15e-06, "loss": 0.3548, "num_tokens": 139867.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 431 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.25, "completions/clipped_ratio": 0.0, "completions/max_length": 64.0, "completions/max_terminated_length": 64.0, "completions/mean_length": 24.25, "completions/mean_terminated_length": 24.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 8.0, "frac_reward_zero_std": 0.0, "grad_norm": 3.5706787109375, "kl": 0.005276472773402929, "learning_rate": 2.155e-06, "loss": 0.3533, "num_tokens": 140188.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 432 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 8.018518518518519, "frac_reward_zero_std": 1.0, "grad_norm": 0.04853292554616928, "kl": 0.004319248721003532, "learning_rate": 2.16e-06, "loss": 0.0002, "num_tokens": 140480.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 433 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.5, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 18.5, "completions/mean_terminated_length": 18.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 8.037037037037036, "frac_reward_zero_std": 1.0, "grad_norm": 0.07763107866048813, "kl": 0.0030780192464590073, "learning_rate": 2.165e-06, "loss": 0.0002, "num_tokens": 140794.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_combined/mean": 1.0, "rewards/reward_combined/std": 0.0, "step": 434 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.0, "completions/clipped_ratio": 0.0, "completions/max_length": 35.0, "completions/max_terminated_length": 35.0, "completions/mean_length": 19.0, "completions/mean_terminated_length": 19.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 8.055555555555555, "frac_reward_zero_std": 0.0, "grad_norm": 6.639495372772217, "kl": 0.008489200845360756, "learning_rate": 2.17e-06, "loss": -0.0172, "num_tokens": 141086.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 435 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 8.074074074074074, "frac_reward_zero_std": 0.0, "grad_norm": 5.926559925079346, "kl": 0.004712743917480111, "learning_rate": 2.1750000000000004e-06, "loss": 0.1064, "num_tokens": 141354.0, "reward": 2.75, "reward_std": 1.5, "rewards/reward_combined/mean": 2.75, "rewards/reward_combined/std": 1.5, "step": 436 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.5, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 10.5, "completions/mean_terminated_length": 10.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 8.092592592592593, "frac_reward_zero_std": 1.0, "grad_norm": 0.041918523609638214, "kl": 0.0037772515206597745, "learning_rate": 2.1800000000000003e-06, "loss": 0.0002, "num_tokens": 141624.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 437 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 28.75, "completions/clipped_ratio": 0.0, "completions/max_length": 41.0, "completions/max_terminated_length": 41.0, "completions/mean_length": 28.75, "completions/mean_terminated_length": 28.75, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 8.11111111111111, "frac_reward_zero_std": 0.0, "grad_norm": 2.8320810794830322, "kl": 0.010540789924561977, "learning_rate": 2.1850000000000003e-06, "loss": 0.214, "num_tokens": 141991.0, "reward": 3.625, "reward_std": 2.75, "rewards/reward_combined/mean": 3.625, "rewards/reward_combined/std": 2.75, "step": 438 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 83.25, "completions/clipped_ratio": 0.25, "completions/max_length": 256.0, "completions/max_terminated_length": 34.0, "completions/mean_length": 83.25, "completions/mean_terminated_length": 25.666667938232422, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 8.12962962962963, "frac_reward_zero_std": 0.0, "grad_norm": 3.1911678314208984, "kl": 0.004377743229269981, "learning_rate": 2.19e-06, "loss": 0.1008, "num_tokens": 142552.0, "reward": 2.625, "reward_std": 3.567795753479004, "rewards/reward_combined/mean": 2.625, "rewards/reward_combined/std": 3.567795753479004, "step": 439 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 8.148148148148149, "frac_reward_zero_std": 1.0, "grad_norm": 0.05620883032679558, "kl": 0.004248459299560636, "learning_rate": 2.195e-06, "loss": 0.0002, "num_tokens": 142852.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 440 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.0, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 19.0, "completions/mean_terminated_length": 19.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 8.166666666666666, "frac_reward_zero_std": 1.0, "grad_norm": 0.09759488701820374, "kl": 0.0076660343911498785, "learning_rate": 2.2e-06, "loss": 0.0004, "num_tokens": 143176.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_combined/mean": 1.0, "rewards/reward_combined/std": 0.0, "step": 441 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.75, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 23.75, "completions/mean_terminated_length": 23.75, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 8.185185185185185, "frac_reward_zero_std": 0.0, "grad_norm": 1.5405514240264893, "kl": 0.0011683102929964662, "learning_rate": 2.205e-06, "loss": -0.0001, "num_tokens": 143539.0, "reward": 1.625, "reward_std": 1.25, "rewards/reward_combined/mean": 1.625, "rewards/reward_combined/std": 1.25, "step": 442 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.25, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 12.25, "completions/mean_terminated_length": 12.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 8.203703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.07645189017057419, "kl": 0.0036318821366876364, "learning_rate": 2.21e-06, "loss": 0.0002, "num_tokens": 143800.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 443 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 8.222222222222221, "frac_reward_zero_std": 0.0, "grad_norm": 5.655681610107422, "kl": 0.00352186756208539, "learning_rate": 2.2150000000000004e-06, "loss": 0.0043, "num_tokens": 144086.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 444 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 8.24074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.010776776820421219, "kl": 0.001828725216910243, "learning_rate": 2.2200000000000003e-06, "loss": 0.0001, "num_tokens": 144350.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 445 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 8.25925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.031225670129060745, "kl": 0.0034308823524042964, "learning_rate": 2.2250000000000003e-06, "loss": 0.0002, "num_tokens": 144634.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 446 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.25, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 15.25, "completions/mean_terminated_length": 15.25, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 8.277777777777779, "frac_reward_zero_std": 1.0, "grad_norm": 0.10181907564401627, "kl": 0.011331729125231504, "learning_rate": 2.2300000000000002e-06, "loss": 0.0006, "num_tokens": 144927.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 447 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 9.5, "completions/mean_terminated_length": 9.5, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 8.296296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.014530030079185963, "kl": 0.0006197964248713106, "learning_rate": 2.235e-06, "loss": 0.0, "num_tokens": 145185.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 448 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.5, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 18.5, "completions/mean_terminated_length": 18.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 8.314814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.06884822994470596, "kl": 0.01525877183303237, "learning_rate": 2.24e-06, "loss": 0.0008, "num_tokens": 145507.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 449 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 8.333333333333334, "frac_reward_zero_std": 1.0, "grad_norm": 0.0034358676057308912, "kl": 5.413840335677378e-05, "learning_rate": 2.245e-06, "loss": 0.0, "num_tokens": 145779.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 450 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.5, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 21.5, "completions/mean_terminated_length": 21.5, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 8.351851851851851, "frac_reward_zero_std": 1.0, "grad_norm": 0.03183302283287048, "kl": 0.0020237138378433883, "learning_rate": 2.25e-06, "loss": 0.0001, "num_tokens": 146109.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 451 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.25, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 8.25, "completions/mean_terminated_length": 8.25, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 8.37037037037037, "frac_reward_zero_std": 0.0, "grad_norm": 10.089832305908203, "kl": 0.004224767675623298, "learning_rate": 2.2550000000000004e-06, "loss": 0.0391, "num_tokens": 146370.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 452 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 37.75, "completions/clipped_ratio": 0.0, "completions/max_length": 108.0, "completions/max_terminated_length": 108.0, "completions/mean_length": 37.75, "completions/mean_terminated_length": 37.75, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 8.38888888888889, "frac_reward_zero_std": 0.0, "grad_norm": 2.6209328174591064, "kl": 0.006678661098703742, "learning_rate": 2.2600000000000004e-06, "loss": 0.3928, "num_tokens": 146745.0, "reward": 7.5, "reward_std": 1.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 1.0, "step": 453 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.5, "completions/clipped_ratio": 0.0, "completions/max_length": 31.0, "completions/max_terminated_length": 31.0, "completions/mean_length": 18.5, "completions/mean_terminated_length": 18.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 8.407407407407407, "frac_reward_zero_std": 0.0, "grad_norm": 4.086695671081543, "kl": 0.02766399085521698, "learning_rate": 2.2650000000000003e-06, "loss": 0.2354, "num_tokens": 147051.0, "reward": 4.0, "reward_std": 4.041451930999756, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 4.041451930999756, "step": 454 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 8.425925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.04221894592046738, "kl": 0.001018628478050232, "learning_rate": 2.2700000000000003e-06, "loss": 0.0001, "num_tokens": 147263.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 455 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.25, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 16.25, "completions/mean_terminated_length": 16.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 8.444444444444445, "frac_reward_zero_std": 0.0, "grad_norm": 4.192069053649902, "kl": 0.016187540255486965, "learning_rate": 2.2750000000000002e-06, "loss": 0.2785, "num_tokens": 147552.0, "reward": 5.75, "reward_std": 2.598076105117798, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 2.598076105117798, "step": 456 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 8.462962962962964, "frac_reward_zero_std": 1.0, "grad_norm": 0.0006116730510257185, "kl": 2.215057611465454e-05, "learning_rate": 2.28e-06, "loss": 0.0, "num_tokens": 147772.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 457 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 8.481481481481481, "frac_reward_zero_std": 0.0, "grad_norm": 11.311041831970215, "kl": 0.018162449821829796, "learning_rate": 2.285e-06, "loss": 0.3263, "num_tokens": 148004.0, "reward": 2.625, "reward_std": 2.75, "rewards/reward_combined/mean": 2.625, "rewards/reward_combined/std": 2.75, "step": 458 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.75, "completions/clipped_ratio": 0.0, "completions/max_length": 27.0, "completions/max_terminated_length": 27.0, "completions/mean_length": 19.75, "completions/mean_terminated_length": 19.75, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 8.5, "frac_reward_zero_std": 0.0, "grad_norm": 6.157555103302002, "kl": 0.015731837134808302, "learning_rate": 2.29e-06, "loss": 0.0372, "num_tokens": 148303.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 459 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 8.518518518518519, "frac_reward_zero_std": 1.0, "grad_norm": 0.05385258421301842, "kl": 0.0013910308480262756, "learning_rate": 2.2950000000000005e-06, "loss": 0.0001, "num_tokens": 148511.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 460 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 39.0, "completions/clipped_ratio": 0.0, "completions/max_length": 45.0, "completions/max_terminated_length": 45.0, "completions/mean_length": 39.0, "completions/mean_terminated_length": 39.0, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 8.537037037037036, "frac_reward_zero_std": 0.0, "grad_norm": 2.1325249671936035, "kl": 0.0016097993939183652, "learning_rate": 2.3000000000000004e-06, "loss": -0.0013, "num_tokens": 148947.0, "reward": 1.100000023841858, "reward_std": 1.2701705694198608, "rewards/reward_combined/mean": 1.100000023841858, "rewards/reward_combined/std": 1.2701706886291504, "step": 461 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.75, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 19.75, "completions/mean_terminated_length": 19.75, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 8.555555555555555, "frac_reward_zero_std": 0.0, "grad_norm": 5.6328535079956055, "kl": 0.014899402856826782, "learning_rate": 2.3050000000000004e-06, "loss": 0.0258, "num_tokens": 149254.0, "reward": 5.125, "reward_std": 3.4731109142303467, "rewards/reward_combined/mean": 5.125, "rewards/reward_combined/std": 3.4731109142303467, "step": 462 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.75, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.75, "completions/mean_terminated_length": 6.75, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 8.574074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.0059350584633648396, "kl": 0.00032161743729375303, "learning_rate": 2.3100000000000003e-06, "loss": 0.0, "num_tokens": 149489.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 463 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 141.75, "completions/clipped_ratio": 0.5, "completions/max_length": 256.0, "completions/max_terminated_length": 31.0, "completions/mean_length": 141.75, "completions/mean_terminated_length": 27.5, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 8.592592592592592, "frac_reward_zero_std": 0.0, "grad_norm": 2.6204261779785156, "kl": 0.0055110863177105784, "learning_rate": 2.3150000000000003e-06, "loss": -0.0232, "num_tokens": 150280.0, "reward": 2.3499999046325684, "reward_std": 4.464303016662598, "rewards/reward_combined/mean": 2.3499999046325684, "rewards/reward_combined/std": 4.464303016662598, "step": 464 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.75, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 12.75, "completions/mean_terminated_length": 12.75, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 8.61111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.2131420522928238, "kl": 0.016977327992208302, "learning_rate": 2.3200000000000002e-06, "loss": 0.0009, "num_tokens": 150583.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 465 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 79.5, "completions/clipped_ratio": 0.25, "completions/max_length": 256.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 79.5, "completions/mean_terminated_length": 20.666667938232422, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 8.62962962962963, "frac_reward_zero_std": 0.0, "grad_norm": 2.2734646797180176, "kl": 0.006123075494542718, "learning_rate": 2.325e-06, "loss": 0.4261, "num_tokens": 151121.0, "reward": 4.550000190734863, "reward_std": 5.900000095367432, "rewards/reward_combined/mean": 4.550000190734863, "rewards/reward_combined/std": 5.90000057220459, "step": 466 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.5, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 8.5, "completions/mean_terminated_length": 8.5, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 8.648148148148149, "frac_reward_zero_std": 0.0, "grad_norm": 9.942984580993652, "kl": 0.0046069566160440445, "learning_rate": 2.33e-06, "loss": 0.0588, "num_tokens": 151367.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 467 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 84.5, "completions/clipped_ratio": 0.25, "completions/max_length": 256.0, "completions/max_terminated_length": 48.0, "completions/mean_length": 84.5, "completions/mean_terminated_length": 27.33333396911621, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 8.666666666666666, "frac_reward_zero_std": 0.0, "grad_norm": 3.848112106323242, "kl": 0.04146038042381406, "learning_rate": 2.3350000000000005e-06, "loss": 0.1692, "num_tokens": 151945.0, "reward": 4.75, "reward_std": 4.092676162719727, "rewards/reward_combined/mean": 4.75, "rewards/reward_combined/std": 4.092676162719727, "step": 468 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 27.5, "completions/clipped_ratio": 0.0, "completions/max_length": 50.0, "completions/max_terminated_length": 50.0, "completions/mean_length": 27.5, "completions/mean_terminated_length": 27.5, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 8.685185185185185, "frac_reward_zero_std": 0.0, "grad_norm": 5.164795398712158, "kl": 0.018968064337968826, "learning_rate": 2.3400000000000005e-06, "loss": 0.1118, "num_tokens": 152279.0, "reward": 4.25, "reward_std": 4.092676162719727, "rewards/reward_combined/mean": 4.25, "rewards/reward_combined/std": 4.092676162719727, "step": 469 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.75, "completions/clipped_ratio": 0.0, "completions/max_length": 33.0, "completions/max_terminated_length": 33.0, "completions/mean_length": 24.75, "completions/mean_terminated_length": 24.75, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 8.703703703703704, "frac_reward_zero_std": 0.0, "grad_norm": 3.2461397647857666, "kl": 0.0036468070466071367, "learning_rate": 2.345e-06, "loss": 0.1015, "num_tokens": 152606.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 470 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 33.75, "completions/clipped_ratio": 0.0, "completions/max_length": 45.0, "completions/max_terminated_length": 45.0, "completions/mean_length": 33.75, "completions/mean_terminated_length": 33.75, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 8.722222222222221, "frac_reward_zero_std": 0.0, "grad_norm": 2.995934009552002, "kl": 0.01234549144282937, "learning_rate": 2.35e-06, "loss": -0.1299, "num_tokens": 152977.0, "reward": 2.625, "reward_std": 1.4361406564712524, "rewards/reward_combined/mean": 2.625, "rewards/reward_combined/std": 1.4361406564712524, "step": 471 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 8.74074074074074, "frac_reward_zero_std": 0.0, "grad_norm": 5.76177453994751, "kl": 0.038873836398124695, "learning_rate": 2.355e-06, "loss": 0.0377, "num_tokens": 153257.0, "reward": 5.875, "reward_std": 3.25, "rewards/reward_combined/mean": 5.875, "rewards/reward_combined/std": 3.25, "step": 472 }, { "clip_ratio/high_max": 0.005681818351149559, "clip_ratio/high_mean": 0.005681818351149559, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0078125, "clip_ratio/region_mean": 0.013494318351149559, "completion_length": 38.0, "completions/clipped_ratio": 0.0, "completions/max_length": 57.0, "completions/max_terminated_length": 57.0, "completions/mean_length": 38.0, "completions/mean_terminated_length": 38.0, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 8.75925925925926, "frac_reward_zero_std": 0.0, "grad_norm": 3.6669111251831055, "kl": 0.009300600038841367, "learning_rate": 2.3600000000000003e-06, "loss": 0.0855, "num_tokens": 153625.0, "reward": 0.0, "reward_std": 1.154700517654419, "rewards/reward_combined/mean": 0.0, "rewards/reward_combined/std": 1.154700517654419, "step": 473 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 8.777777777777779, "frac_reward_zero_std": 1.0, "grad_norm": 0.009261498227715492, "kl": 0.00020227262575645, "learning_rate": 2.3650000000000002e-06, "loss": 0.0, "num_tokens": 153905.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 474 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.013513513840734959, "clip_ratio/low_min": 0.013513513840734959, "clip_ratio/region_mean": 0.013513513840734959, "completion_length": 18.75, "completions/clipped_ratio": 0.0, "completions/max_length": 25.0, "completions/max_terminated_length": 25.0, "completions/mean_length": 18.75, "completions/mean_terminated_length": 18.75, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 8.796296296296296, "frac_reward_zero_std": 0.0, "grad_norm": 4.672394275665283, "kl": 0.16028425097465515, "learning_rate": 2.37e-06, "loss": 0.1146, "num_tokens": 154200.0, "reward": 1.0, "reward_std": 2.4494898319244385, "rewards/reward_combined/mean": 1.0, "rewards/reward_combined/std": 2.4494898319244385, "step": 475 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 5.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 5.0, "completions/mean_terminated_length": 5.0, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 8.814814814814815, "frac_reward_zero_std": 0.0, "grad_norm": 12.555069923400879, "kl": 0.002975350624183193, "learning_rate": 2.375e-06, "loss": 0.0052, "num_tokens": 154420.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 476 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.25, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.25, "completions/mean_terminated_length": 13.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 8.833333333333334, "frac_reward_zero_std": 1.0, "grad_norm": 0.008093948476016521, "kl": 0.001128108036937192, "learning_rate": 2.38e-06, "loss": 0.0001, "num_tokens": 154697.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 477 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 8.851851851851851, "frac_reward_zero_std": 1.0, "grad_norm": 0.008237867616117, "kl": 0.00013291239883983508, "learning_rate": 2.385e-06, "loss": 0.0, "num_tokens": 154953.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 478 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.5, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 20.5, "completions/mean_terminated_length": 20.5, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 8.87037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.05019519105553627, "kl": 0.009890516754239798, "learning_rate": 2.39e-06, "loss": 0.0005, "num_tokens": 155287.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 479 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.75, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 9.75, "completions/mean_terminated_length": 9.75, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 8.88888888888889, "frac_reward_zero_std": 0.0, "grad_norm": 6.414970397949219, "kl": 0.047585778404027224, "learning_rate": 2.395e-06, "loss": 0.0726, "num_tokens": 155550.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 480 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.0, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 19.0, "completions/mean_terminated_length": 19.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 8.907407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.12921972572803497, "kl": 0.010220121592283249, "learning_rate": 2.4000000000000003e-06, "loss": 0.0005, "num_tokens": 155866.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 481 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 8.925925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.009771255776286125, "kl": 0.004996911622583866, "learning_rate": 2.4050000000000003e-06, "loss": 0.0002, "num_tokens": 156178.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 482 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.0, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 19.0, "completions/mean_terminated_length": 19.0, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 8.944444444444445, "frac_reward_zero_std": 0.0, "grad_norm": 4.554996490478516, "kl": 0.0018520914018154144, "learning_rate": 2.4100000000000002e-06, "loss": 0.1026, "num_tokens": 156514.0, "reward": 5.25, "reward_std": 4.5, "rewards/reward_combined/mean": 5.25, "rewards/reward_combined/std": 4.5, "step": 483 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 8.962962962962964, "frac_reward_zero_std": 1.0, "grad_norm": 0.026373308151960373, "kl": 0.005340062081813812, "learning_rate": 2.415e-06, "loss": 0.0003, "num_tokens": 156750.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 484 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.5, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 16.5, "completions/mean_terminated_length": 16.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 8.981481481481481, "frac_reward_zero_std": 1.0, "grad_norm": 0.12289481610059738, "kl": 0.014984317123889923, "learning_rate": 2.42e-06, "loss": 0.0007, "num_tokens": 157036.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 485 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 9.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.010437137447297573, "kl": 0.0025793612003326416, "learning_rate": 2.425e-06, "loss": 0.0001, "num_tokens": 157248.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 486 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.25, "completions/clipped_ratio": 0.0, "completions/max_length": 40.0, "completions/max_terminated_length": 40.0, "completions/mean_length": 22.25, "completions/mean_terminated_length": 22.25, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 9.018518518518519, "frac_reward_zero_std": 0.0, "grad_norm": 4.812546253204346, "kl": 0.04968047788133845, "learning_rate": 2.43e-06, "loss": 0.1843, "num_tokens": 157597.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 487 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 39.0, "completions/max_terminated_length": 39.0, "completions/mean_length": 18.0, "completions/mean_terminated_length": 18.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 9.037037037037036, "frac_reward_zero_std": 0.0, "grad_norm": 5.656309127807617, "kl": 0.008302829344756901, "learning_rate": 2.435e-06, "loss": 0.2856, "num_tokens": 157921.0, "reward": 5.375, "reward_std": 5.25, "rewards/reward_combined/mean": 5.375, "rewards/reward_combined/std": 5.25, "step": 488 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 9.055555555555555, "frac_reward_zero_std": 1.0, "grad_norm": 0.007052761502563953, "kl": 0.00015019923012005165, "learning_rate": 2.4400000000000004e-06, "loss": 0.0, "num_tokens": 158177.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 489 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.0, "completions/clipped_ratio": 0.0, "completions/max_length": 33.0, "completions/max_terminated_length": 33.0, "completions/mean_length": 22.0, "completions/mean_terminated_length": 22.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 9.074074074074074, "frac_reward_zero_std": 0.0, "grad_norm": 5.241264343261719, "kl": 0.026102915406227112, "learning_rate": 2.4450000000000003e-06, "loss": 0.1025, "num_tokens": 158489.0, "reward": 3.5, "reward_std": 2.915475845336914, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 2.915475845336914, "step": 490 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 9.092592592592593, "frac_reward_zero_std": 1.0, "grad_norm": 0.10147115588188171, "kl": 0.010728138498961926, "learning_rate": 2.4500000000000003e-06, "loss": 0.0005, "num_tokens": 158761.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 491 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 9.11111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.07248181104660034, "kl": 0.006980241741985083, "learning_rate": 2.4550000000000002e-06, "loss": 0.0004, "num_tokens": 159033.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 492 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 17.0, "completions/mean_terminated_length": 17.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 9.12962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.005719267297536135, "kl": 0.00029906454437877983, "learning_rate": 2.46e-06, "loss": 0.0, "num_tokens": 159341.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_combined/mean": 1.0, "rewards/reward_combined/std": 0.0, "step": 493 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 5.75, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 5.75, "completions/mean_terminated_length": 5.75, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 9.148148148148149, "frac_reward_zero_std": 0.0, "grad_norm": 9.978959083557129, "kl": 0.04948511766269803, "learning_rate": 2.465e-06, "loss": -0.2752, "num_tokens": 159572.0, "reward": 3.125, "reward_std": 1.75, "rewards/reward_combined/mean": 3.125, "rewards/reward_combined/std": 1.75, "step": 494 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 41.5, "completions/clipped_ratio": 0.0, "completions/max_length": 48.0, "completions/max_terminated_length": 48.0, "completions/mean_length": 41.5, "completions/mean_terminated_length": 41.5, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 9.166666666666666, "frac_reward_zero_std": 0.0, "grad_norm": 1.390055537223816, "kl": 0.14080512756481767, "learning_rate": 2.47e-06, "loss": -0.0363, "num_tokens": 160018.0, "reward": -0.40000003576278687, "reward_std": 2.765260696411133, "rewards/reward_combined/mean": -0.40000003576278687, "rewards/reward_combined/std": 2.765260696411133, "step": 495 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 9.185185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.010353906080126762, "kl": 0.0026435106992721558, "learning_rate": 2.475e-06, "loss": 0.0001, "num_tokens": 160230.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 496 }, { "clip_ratio/high_max": 0.006666666828095913, "clip_ratio/high_mean": 0.006666666828095913, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.006666666828095913, "completion_length": 36.5, "completions/clipped_ratio": 0.0, "completions/max_length": 43.0, "completions/max_terminated_length": 43.0, "completions/mean_length": 36.5, "completions/mean_terminated_length": 36.5, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 9.203703703703704, "frac_reward_zero_std": 0.0, "grad_norm": 3.0811069011688232, "kl": 0.02197659108787775, "learning_rate": 2.4800000000000004e-06, "loss": -0.0074, "num_tokens": 160600.0, "reward": 2.375, "reward_std": 1.25, "rewards/reward_combined/mean": 2.375, "rewards/reward_combined/std": 1.25, "step": 497 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 9.222222222222221, "frac_reward_zero_std": 1.0, "grad_norm": 0.001300562173128128, "kl": 5.476176738739014e-05, "learning_rate": 2.4850000000000003e-06, "loss": 0.0, "num_tokens": 160820.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 498 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 150.0, "completions/clipped_ratio": 0.5, "completions/max_length": 256.0, "completions/max_terminated_length": 55.0, "completions/mean_length": 150.0, "completions/mean_terminated_length": 44.0, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 9.24074074074074, "frac_reward_zero_std": 0.0, "grad_norm": 3.1753973960876465, "kl": 0.0035257707932032645, "learning_rate": 2.4900000000000003e-06, "loss": 0.1187, "num_tokens": 161636.0, "reward": 0.42500001192092896, "reward_std": 2.3641419410705566, "rewards/reward_combined/mean": 0.42500001192092896, "rewards/reward_combined/std": 2.3641419410705566, "step": 499 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 18.0, "completions/mean_terminated_length": 18.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 9.25925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.011591876856982708, "kl": 0.0019661643891595304, "learning_rate": 2.4950000000000003e-06, "loss": 0.0001, "num_tokens": 161948.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 500 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.25, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.25, "completions/mean_terminated_length": 11.25, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 9.277777777777779, "frac_reward_zero_std": 1.0, "grad_norm": 0.08762506395578384, "kl": 0.007637038710527122, "learning_rate": 2.5e-06, "loss": 0.0004, "num_tokens": 162217.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 501 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.75, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 17.75, "completions/mean_terminated_length": 17.75, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 9.296296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.0217191930860281, "kl": 0.0035082323011010885, "learning_rate": 2.505e-06, "loss": 0.0002, "num_tokens": 162516.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 502 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.75, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 15.75, "completions/mean_terminated_length": 15.75, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 9.314814814814815, "frac_reward_zero_std": 0.0, "grad_norm": 5.24668025970459, "kl": 0.05380616895854473, "learning_rate": 2.51e-06, "loss": -0.0807, "num_tokens": 162819.0, "reward": 2.75, "reward_std": 3.5, "rewards/reward_combined/mean": 2.75, "rewards/reward_combined/std": 3.5, "step": 503 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 9.333333333333334, "frac_reward_zero_std": 1.0, "grad_norm": 0.025283321738243103, "kl": 0.005985692143440247, "learning_rate": 2.515e-06, "loss": 0.0003, "num_tokens": 163055.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 504 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.5, "completions/clipped_ratio": 0.0, "completions/max_length": 25.0, "completions/max_terminated_length": 25.0, "completions/mean_length": 24.5, "completions/mean_terminated_length": 24.5, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 9.351851851851851, "frac_reward_zero_std": 1.0, "grad_norm": 0.017328931018710136, "kl": 0.0011993437656201422, "learning_rate": 2.52e-06, "loss": 0.0001, "num_tokens": 163421.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_combined/mean": 1.0, "rewards/reward_combined/std": 0.0, "step": 505 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.5, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 16.5, "completions/mean_terminated_length": 16.5, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 9.37037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.07956413179636002, "kl": 0.005813772324472666, "learning_rate": 2.5250000000000004e-06, "loss": 0.0003, "num_tokens": 163735.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 506 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.25, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 15.25, "completions/mean_terminated_length": 15.25, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 9.38888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.07927875965833664, "kl": 0.008359069935977459, "learning_rate": 2.5300000000000003e-06, "loss": 0.0004, "num_tokens": 164020.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 507 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.5, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 19.5, "completions/mean_terminated_length": 19.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 9.407407407407407, "frac_reward_zero_std": 0.0, "grad_norm": 5.659533500671387, "kl": 0.012495339615270495, "learning_rate": 2.5350000000000003e-06, "loss": -0.0346, "num_tokens": 164326.0, "reward": 3.875, "reward_std": 3.092329263687134, "rewards/reward_combined/mean": 3.875, "rewards/reward_combined/std": 3.092329263687134, "step": 508 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.0, "completions/clipped_ratio": 0.0, "completions/max_length": 25.0, "completions/max_terminated_length": 25.0, "completions/mean_length": 23.0, "completions/mean_terminated_length": 23.0, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 9.425925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.04890467971563339, "kl": 0.005758650600910187, "learning_rate": 2.5400000000000002e-06, "loss": 0.0003, "num_tokens": 164662.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 509 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.25, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 20.25, "completions/mean_terminated_length": 20.25, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 9.444444444444445, "frac_reward_zero_std": 0.0, "grad_norm": 5.891786098480225, "kl": 0.01453774468973279, "learning_rate": 2.545e-06, "loss": -0.001, "num_tokens": 164963.0, "reward": 4.625, "reward_std": 3.4731109142303467, "rewards/reward_combined/mean": 4.625, "rewards/reward_combined/std": 3.4731109142303467, "step": 510 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.0, "completions/clipped_ratio": 0.0, "completions/max_length": 25.0, "completions/max_terminated_length": 25.0, "completions/mean_length": 19.0, "completions/mean_terminated_length": 19.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 9.462962962962964, "frac_reward_zero_std": 0.0, "grad_norm": 8.461947441101074, "kl": 0.021690708585083485, "learning_rate": 2.55e-06, "loss": -0.0872, "num_tokens": 165259.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 511 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 9.481481481481481, "frac_reward_zero_std": 1.0, "grad_norm": 0.09424034506082535, "kl": 0.006238766363821924, "learning_rate": 2.555e-06, "loss": 0.0003, "num_tokens": 165537.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 512 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.5, "completions/clipped_ratio": 0.25, "completions/max_length": 25.0, "completions/max_terminated_length": 25.0, "completions/mean_length": 22.5, "completions/mean_terminated_length": 22.33333396911621, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 9.5, "frac_reward_zero_std": 1.0, "grad_norm": 0.10210379213094711, "kl": 0.007286053616553545, "learning_rate": 2.56e-06, "loss": 0.0004, "num_tokens": 165875.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_combined/mean": 1.0, "rewards/reward_combined/std": 0.0, "step": 513 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 9.518518518518519, "frac_reward_zero_std": 1.0, "grad_norm": 0.6359588503837585, "kl": 0.09753054194152355, "learning_rate": 2.5650000000000004e-06, "loss": 0.0047, "num_tokens": 166169.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 514 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 27.25, "completions/clipped_ratio": 0.0, "completions/max_length": 44.0, "completions/max_terminated_length": 44.0, "completions/mean_length": 27.25, "completions/mean_terminated_length": 27.25, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 9.537037037037036, "frac_reward_zero_std": 0.0, "grad_norm": 5.064944744110107, "kl": 0.027561096474528313, "learning_rate": 2.5700000000000004e-06, "loss": 0.1688, "num_tokens": 166498.0, "reward": 3.625, "reward_std": 5.202163219451904, "rewards/reward_combined/mean": 3.625, "rewards/reward_combined/std": 5.202163219451904, "step": 515 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 9.555555555555555, "frac_reward_zero_std": 1.0, "grad_norm": 0.045751217752695084, "kl": 0.0027473419904708862, "learning_rate": 2.5750000000000003e-06, "loss": 0.0001, "num_tokens": 166758.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 516 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.5, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 22.5, "completions/mean_terminated_length": 22.5, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 9.574074074074074, "frac_reward_zero_std": 0.0, "grad_norm": 1.9281381368637085, "kl": 0.006388864479959011, "learning_rate": 2.5800000000000003e-06, "loss": 0.0231, "num_tokens": 167076.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 517 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 9.592592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.003929498139768839, "kl": 0.0007419266912620515, "learning_rate": 2.5850000000000002e-06, "loss": 0.0, "num_tokens": 167356.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 518 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 16.5, "completions/mean_terminated_length": 16.5, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 9.61111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.10881363600492477, "kl": 0.012780678225681186, "learning_rate": 2.59e-06, "loss": 0.0006, "num_tokens": 167662.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 519 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 9.62962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.07231725752353668, "kl": 0.001100875437259674, "learning_rate": 2.595e-06, "loss": 0.0001, "num_tokens": 167874.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 520 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 9.648148148148149, "frac_reward_zero_std": 1.0, "grad_norm": 0.08626428991556168, "kl": 0.00836819305550307, "learning_rate": 2.6e-06, "loss": 0.0004, "num_tokens": 168158.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 521 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.75, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 13.75, "completions/mean_terminated_length": 13.75, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 9.666666666666666, "frac_reward_zero_std": 1.0, "grad_norm": 0.13831181824207306, "kl": 0.020632595289498568, "learning_rate": 2.6050000000000005e-06, "loss": 0.0011, "num_tokens": 168425.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 522 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.25, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 16.25, "completions/mean_terminated_length": 16.25, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 9.685185185185185, "frac_reward_zero_std": 0.0, "grad_norm": 6.131930828094482, "kl": 0.01738014444708824, "learning_rate": 2.6100000000000004e-06, "loss": -0.0015, "num_tokens": 168710.0, "reward": 4.125, "reward_std": 3.902456521987915, "rewards/reward_combined/mean": 4.125, "rewards/reward_combined/std": 3.902456521987915, "step": 523 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.5, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 4.5, "completions/mean_terminated_length": 4.5, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 9.703703703703704, "frac_reward_zero_std": 0.0, "grad_norm": 4.205351829528809, "kl": 0.711861215531826, "learning_rate": 2.6150000000000004e-06, "loss": 0.1458, "num_tokens": 168928.0, "reward": 2.5, "reward_std": 3.0, "rewards/reward_combined/mean": 2.5, "rewards/reward_combined/std": 3.0, "step": 524 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.0, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 22.0, "completions/mean_terminated_length": 22.0, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 9.722222222222221, "frac_reward_zero_std": 0.0, "grad_norm": 6.978946208953857, "kl": 0.021479977294802666, "learning_rate": 2.6200000000000003e-06, "loss": 0.0342, "num_tokens": 169264.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 525 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 9.74074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.0965760350227356, "kl": 0.004969261586666107, "learning_rate": 2.6250000000000003e-06, "loss": 0.0002, "num_tokens": 169516.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 526 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.5, "completions/clipped_ratio": 0.0, "completions/max_length": 33.0, "completions/max_terminated_length": 33.0, "completions/mean_length": 24.5, "completions/mean_terminated_length": 24.5, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 9.75925925925926, "frac_reward_zero_std": 0.0, "grad_norm": 5.472059726715088, "kl": 0.017159564420580864, "learning_rate": 2.6300000000000002e-06, "loss": -0.1063, "num_tokens": 169842.0, "reward": 0.75, "reward_std": 2.0615527629852295, "rewards/reward_combined/mean": 0.75, "rewards/reward_combined/std": 2.0615527629852295, "step": 527 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 131.5, "completions/clipped_ratio": 0.25, "completions/max_length": 256.0, "completions/max_terminated_length": 209.0, "completions/mean_length": 131.5, "completions/mean_terminated_length": 90.0, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 9.777777777777779, "frac_reward_zero_std": 0.0, "grad_norm": 1.795655369758606, "kl": 0.007473572622984648, "learning_rate": 2.635e-06, "loss": 0.0541, "num_tokens": 170604.0, "reward": 0.9750000238418579, "reward_std": 1.158663034439087, "rewards/reward_combined/mean": 0.9750000238418579, "rewards/reward_combined/std": 1.158663034439087, "step": 528 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.75, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 12.75, "completions/mean_terminated_length": 12.75, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 9.796296296296296, "frac_reward_zero_std": 0.0, "grad_norm": 10.253057479858398, "kl": 0.03040765505284071, "learning_rate": 2.64e-06, "loss": 0.2138, "num_tokens": 170875.0, "reward": 7.0, "reward_std": 1.0, "rewards/reward_combined/mean": 7.0, "rewards/reward_combined/std": 1.0, "step": 529 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 9.814814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.009860508143901825, "kl": 0.0006473585963249207, "learning_rate": 2.6450000000000005e-06, "loss": 0.0, "num_tokens": 171119.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 530 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 7.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 7.0, "completions/mean_terminated_length": 7.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 9.833333333333334, "frac_reward_zero_std": 1.0, "grad_norm": 0.20436109602451324, "kl": 0.027568168006837368, "learning_rate": 2.6500000000000005e-06, "loss": 0.0009, "num_tokens": 171367.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 531 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 9.851851851851851, "frac_reward_zero_std": 1.0, "grad_norm": 0.0288685355335474, "kl": 0.0031088516116142273, "learning_rate": 2.6550000000000004e-06, "loss": 0.0001, "num_tokens": 171575.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 532 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 74.0, "completions/clipped_ratio": 0.25, "completions/max_length": 256.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 74.0, "completions/mean_terminated_length": 13.333333969116211, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 9.87037037037037, "frac_reward_zero_std": 0.0, "grad_norm": 4.253559589385986, "kl": 0.027173260925337672, "learning_rate": 2.6600000000000004e-06, "loss": 0.4098, "num_tokens": 172087.0, "reward": 4.0, "reward_std": 4.690415859222412, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 4.690415859222412, "step": 533 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 9.88888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.02600042335689068, "kl": 0.0344085618853569, "learning_rate": 2.6650000000000003e-06, "loss": 0.0017, "num_tokens": 172371.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 534 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.25, "completions/clipped_ratio": 0.0, "completions/max_length": 35.0, "completions/max_terminated_length": 35.0, "completions/mean_length": 22.25, "completions/mean_terminated_length": 22.25, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 9.907407407407408, "frac_reward_zero_std": 0.0, "grad_norm": 8.282113075256348, "kl": 0.07047861441969872, "learning_rate": 2.6700000000000003e-06, "loss": 0.0533, "num_tokens": 172712.0, "reward": 4.0, "reward_std": 4.041451930999756, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 4.041451930999756, "step": 535 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 9.925925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.0131550133228302, "kl": 0.01609636191278696, "learning_rate": 2.6750000000000002e-06, "loss": 0.0008, "num_tokens": 172972.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 536 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.5, "completions/clipped_ratio": 0.0, "completions/max_length": 37.0, "completions/max_terminated_length": 37.0, "completions/mean_length": 20.5, "completions/mean_terminated_length": 20.5, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 9.944444444444445, "frac_reward_zero_std": 0.0, "grad_norm": 5.359115123748779, "kl": 0.008659072918817401, "learning_rate": 2.68e-06, "loss": 0.2106, "num_tokens": 173286.0, "reward": 6.75, "reward_std": 2.5, "rewards/reward_combined/mean": 6.75, "rewards/reward_combined/std": 2.5, "step": 537 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 5.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 5.0, "completions/mean_terminated_length": 5.0, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 9.962962962962964, "frac_reward_zero_std": 1.0, "grad_norm": 0.006103401072323322, "kl": 0.002427017781883478, "learning_rate": 2.6850000000000006e-06, "loss": 0.0001, "num_tokens": 173506.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 538 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.5, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 9.5, "completions/mean_terminated_length": 9.5, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 9.981481481481481, "frac_reward_zero_std": 0.0, "grad_norm": 7.742459297180176, "kl": 0.03744380176067352, "learning_rate": 2.6900000000000005e-06, "loss": -0.0487, "num_tokens": 173768.0, "reward": 4.625, "reward_std": 2.25, "rewards/reward_combined/mean": 4.625, "rewards/reward_combined/std": 2.25, "step": 539 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.5, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 19.5, "completions/mean_terminated_length": 19.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 10.0, "frac_reward_zero_std": 0.0, "grad_norm": 4.42496395111084, "kl": 0.0502274576574564, "learning_rate": 2.6950000000000005e-06, "loss": -0.1836, "num_tokens": 174098.0, "reward": 6.75, "reward_std": 2.5, "rewards/reward_combined/mean": 6.75, "rewards/reward_combined/std": 2.5, "step": 540 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.75, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 19.75, "completions/mean_terminated_length": 19.75, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 10.018518518518519, "frac_reward_zero_std": 1.0, "grad_norm": 0.03392986208200455, "kl": 0.0015879879938438535, "learning_rate": 2.7000000000000004e-06, "loss": 0.0001, "num_tokens": 174405.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 541 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 10.037037037037036, "frac_reward_zero_std": 1.0, "grad_norm": 0.18598981201648712, "kl": 0.015024986816570163, "learning_rate": 2.7050000000000004e-06, "loss": 0.0008, "num_tokens": 174701.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 542 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.5, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 8.5, "completions/mean_terminated_length": 8.5, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 10.055555555555555, "frac_reward_zero_std": 0.0, "grad_norm": 8.32227611541748, "kl": 0.013548072893172503, "learning_rate": 2.7100000000000003e-06, "loss": -0.0568, "num_tokens": 174955.0, "reward": 4.125, "reward_std": 2.25, "rewards/reward_combined/mean": 4.125, "rewards/reward_combined/std": 2.25, "step": 543 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.5, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 12.5, "completions/mean_terminated_length": 12.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 10.074074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.03571789339184761, "kl": 0.0023571939673274755, "learning_rate": 2.7150000000000003e-06, "loss": 0.0001, "num_tokens": 175229.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 544 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 10.092592592592593, "frac_reward_zero_std": 1.0, "grad_norm": 0.02958475425839424, "kl": 0.012637789361178875, "learning_rate": 2.7200000000000002e-06, "loss": 0.0006, "num_tokens": 175501.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 545 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 10.11111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.06795376539230347, "kl": 0.002196485293097794, "learning_rate": 2.7250000000000006e-06, "loss": 0.0001, "num_tokens": 175765.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 546 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 10.12962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.4135410785675049, "kl": 0.009765662252902985, "learning_rate": 2.7300000000000005e-06, "loss": 0.0005, "num_tokens": 175977.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 547 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 10.148148148148149, "frac_reward_zero_std": 1.0, "grad_norm": 0.025103697553277016, "kl": 0.03453299589455128, "learning_rate": 2.7350000000000005e-06, "loss": 0.0017, "num_tokens": 176261.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 548 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 10.166666666666666, "frac_reward_zero_std": 0.0, "grad_norm": 5.029152870178223, "kl": 0.01284768059849739, "learning_rate": 2.7400000000000004e-06, "loss": -0.0026, "num_tokens": 176529.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 549 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 83.5, "completions/clipped_ratio": 0.25, "completions/max_length": 256.0, "completions/max_terminated_length": 34.0, "completions/mean_length": 83.5, "completions/mean_terminated_length": 26.0, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 10.185185185185185, "frac_reward_zero_std": 0.0, "grad_norm": 4.184909820556641, "kl": 0.048890700563788414, "learning_rate": 2.7450000000000004e-06, "loss": 0.0463, "num_tokens": 177115.0, "reward": 1.75, "reward_std": 1.7559422254562378, "rewards/reward_combined/mean": 1.75, "rewards/reward_combined/std": 1.7559423446655273, "step": 550 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 37.5, "completions/clipped_ratio": 0.0, "completions/max_length": 105.0, "completions/max_terminated_length": 105.0, "completions/mean_length": 37.5, "completions/mean_terminated_length": 37.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 10.203703703703704, "frac_reward_zero_std": 0.0, "grad_norm": 3.2870981693267822, "kl": 0.03419908881187439, "learning_rate": 2.7500000000000004e-06, "loss": -0.0149, "num_tokens": 177497.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 551 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.25, "completions/clipped_ratio": 0.0, "completions/max_length": 37.0, "completions/max_terminated_length": 37.0, "completions/mean_length": 32.25, "completions/mean_terminated_length": 32.25, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 10.222222222222221, "frac_reward_zero_std": 0.0, "grad_norm": 2.8574604988098145, "kl": 0.022396015003323555, "learning_rate": 2.7550000000000003e-06, "loss": 0.0502, "num_tokens": 177850.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 552 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 7.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 7.0, "completions/mean_terminated_length": 7.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 10.24074074074074, "frac_reward_zero_std": 0.0, "grad_norm": 9.99863338470459, "kl": 0.003788284957408905, "learning_rate": 2.7600000000000003e-06, "loss": -0.1903, "num_tokens": 178090.0, "reward": 3.875, "reward_std": 0.25, "rewards/reward_combined/mean": 3.875, "rewards/reward_combined/std": 0.25, "step": 553 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 10.25925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.2916069030761719, "kl": 0.036970311775803566, "learning_rate": 2.7650000000000006e-06, "loss": 0.0018, "num_tokens": 178380.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 554 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 10.277777777777779, "frac_reward_zero_std": 1.0, "grad_norm": 0.0108443982899189, "kl": 0.0028760135173797607, "learning_rate": 2.7700000000000006e-06, "loss": 0.0001, "num_tokens": 178592.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 555 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.75, "completions/clipped_ratio": 0.0, "completions/max_length": 27.0, "completions/max_terminated_length": 27.0, "completions/mean_length": 21.75, "completions/mean_terminated_length": 21.75, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 10.296296296296296, "frac_reward_zero_std": 0.0, "grad_norm": 5.265232086181641, "kl": 0.026174938306212425, "learning_rate": 2.7750000000000005e-06, "loss": 0.1404, "num_tokens": 178931.0, "reward": 5.125, "reward_std": 3.4731109142303467, "rewards/reward_combined/mean": 5.125, "rewards/reward_combined/std": 3.4731109142303467, "step": 556 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 10.314814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.015753518790006638, "kl": 0.0002501994458725676, "learning_rate": 2.7800000000000005e-06, "loss": 0.0, "num_tokens": 179187.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 557 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 12.5, "completions/mean_terminated_length": 12.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 10.333333333333334, "frac_reward_zero_std": 1.0, "grad_norm": 0.054202526807785034, "kl": 0.00558291026391089, "learning_rate": 2.7850000000000004e-06, "loss": 0.0003, "num_tokens": 179469.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 558 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 10.351851851851851, "frac_reward_zero_std": 1.0, "grad_norm": 0.05797901377081871, "kl": 0.002154251909814775, "learning_rate": 2.7900000000000004e-06, "loss": 0.0001, "num_tokens": 179737.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 559 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 27.5, "completions/clipped_ratio": 0.0, "completions/max_length": 33.0, "completions/max_terminated_length": 33.0, "completions/mean_length": 27.5, "completions/mean_terminated_length": 27.5, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 10.37037037037037, "frac_reward_zero_std": 0.0, "grad_norm": 4.9145588874816895, "kl": 0.011872428236529231, "learning_rate": 2.7950000000000003e-06, "loss": -0.0191, "num_tokens": 180063.0, "reward": 2.125, "reward_std": 1.6007810831069946, "rewards/reward_combined/mean": 2.125, "rewards/reward_combined/std": 1.6007810831069946, "step": 560 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 10.38888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.02927069179713726, "kl": 0.0055152177810668945, "learning_rate": 2.8000000000000003e-06, "loss": 0.0003, "num_tokens": 180299.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 561 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 10.407407407407407, "frac_reward_zero_std": 1.0, "grad_norm": 0.06958382576704025, "kl": 0.0005728080868721008, "learning_rate": 2.8050000000000007e-06, "loss": 0.0, "num_tokens": 180511.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 562 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 5.75, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 5.75, "completions/mean_terminated_length": 5.75, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 10.425925925925926, "frac_reward_zero_std": 0.0, "grad_norm": 11.519536018371582, "kl": 0.030706753488630056, "learning_rate": 2.8100000000000006e-06, "loss": 0.1481, "num_tokens": 180734.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 563 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.75, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 10.75, "completions/mean_terminated_length": 10.75, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 10.444444444444445, "frac_reward_zero_std": 0.0, "grad_norm": 6.022866725921631, "kl": 0.026535941753536463, "learning_rate": 2.815e-06, "loss": -0.0011, "num_tokens": 181005.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 564 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 10.462962962962964, "frac_reward_zero_std": 1.0, "grad_norm": 0.09410649538040161, "kl": 0.00992079044226557, "learning_rate": 2.82e-06, "loss": 0.0005, "num_tokens": 181265.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 565 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 10.481481481481481, "frac_reward_zero_std": 1.0, "grad_norm": 0.06610658019781113, "kl": 0.0029819533228874207, "learning_rate": 2.825e-06, "loss": 0.0001, "num_tokens": 181525.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 566 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 10.5, "frac_reward_zero_std": 0.0, "grad_norm": 3.153836488723755, "kl": 0.0007590175373479724, "learning_rate": 2.83e-06, "loss": 0.0336, "num_tokens": 181847.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 567 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 10.518518518518519, "frac_reward_zero_std": 1.0, "grad_norm": 0.010375253856182098, "kl": 0.005743906833231449, "learning_rate": 2.835e-06, "loss": 0.0003, "num_tokens": 182159.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 568 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.75, "completions/clipped_ratio": 0.0, "completions/max_length": 27.0, "completions/max_terminated_length": 27.0, "completions/mean_length": 21.75, "completions/mean_terminated_length": 21.75, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 10.537037037037036, "frac_reward_zero_std": 0.0, "grad_norm": 7.5953145027160645, "kl": 0.056432055309414864, "learning_rate": 2.84e-06, "loss": 0.0914, "num_tokens": 182470.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 569 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.0, "completions/clipped_ratio": 0.0, "completions/max_length": 25.0, "completions/max_terminated_length": 25.0, "completions/mean_length": 20.0, "completions/mean_terminated_length": 20.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 10.555555555555555, "frac_reward_zero_std": 0.0, "grad_norm": 4.468820571899414, "kl": 0.1308787614107132, "learning_rate": 2.845e-06, "loss": 0.137, "num_tokens": 182790.0, "reward": 5.125, "reward_std": 3.4731109142303467, "rewards/reward_combined/mean": 5.125, "rewards/reward_combined/std": 3.4731109142303467, "step": 570 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 10.574074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.045729685574769974, "kl": 0.0021327821887098253, "learning_rate": 2.85e-06, "loss": 0.0001, "num_tokens": 183068.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 571 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.75, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 23.75, "completions/mean_terminated_length": 23.75, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 10.592592592592592, "frac_reward_zero_std": 0.0, "grad_norm": 1.7444102764129639, "kl": 0.009015833958983421, "learning_rate": 2.855e-06, "loss": 0.0002, "num_tokens": 183431.0, "reward": 1.625, "reward_std": 1.25, "rewards/reward_combined/mean": 1.625, "rewards/reward_combined/std": 1.25, "step": 572 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 37.25, "completions/clipped_ratio": 0.0, "completions/max_length": 44.0, "completions/max_terminated_length": 44.0, "completions/mean_length": 37.25, "completions/mean_terminated_length": 37.25, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 10.61111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.296500563621521, "kl": 0.02709485311061144, "learning_rate": 2.86e-06, "loss": 0.0013, "num_tokens": 183816.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_combined/mean": 1.0, "rewards/reward_combined/std": 0.0, "step": 573 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 56.75, "completions/clipped_ratio": 0.0, "completions/max_length": 137.0, "completions/max_terminated_length": 137.0, "completions/mean_length": 56.75, "completions/mean_terminated_length": 56.75, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 10.62962962962963, "frac_reward_zero_std": 0.0, "grad_norm": 4.380898475646973, "kl": 0.026649540290236473, "learning_rate": 2.865e-06, "loss": 0.0831, "num_tokens": 184263.0, "reward": 2.375, "reward_std": 3.8810436725616455, "rewards/reward_combined/mean": 2.375, "rewards/reward_combined/std": 3.8810436725616455, "step": 574 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.25, "completions/clipped_ratio": 0.0, "completions/max_length": 30.0, "completions/max_terminated_length": 30.0, "completions/mean_length": 21.25, "completions/mean_terminated_length": 21.25, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 10.648148148148149, "frac_reward_zero_std": 0.0, "grad_norm": 7.516073703765869, "kl": 0.026560556143522263, "learning_rate": 2.87e-06, "loss": 0.1418, "num_tokens": 184568.0, "reward": 2.5, "reward_std": 1.7320507764816284, "rewards/reward_combined/mean": 2.5, "rewards/reward_combined/std": 1.7320507764816284, "step": 575 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.75, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 20.75, "completions/mean_terminated_length": 20.75, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 10.666666666666666, "frac_reward_zero_std": 1.0, "grad_norm": 0.10316488891839981, "kl": 0.0175789101049304, "learning_rate": 2.875e-06, "loss": 0.0009, "num_tokens": 184899.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 576 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.75, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 18.75, "completions/mean_terminated_length": 18.75, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 10.685185185185185, "frac_reward_zero_std": 0.0, "grad_norm": 3.0641086101531982, "kl": 0.027505491860210896, "learning_rate": 2.88e-06, "loss": 0.0029, "num_tokens": 185198.0, "reward": 4.125, "reward_std": 5.202163219451904, "rewards/reward_combined/mean": 4.125, "rewards/reward_combined/std": 5.202163219451904, "step": 577 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.0, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 22.0, "completions/mean_terminated_length": 22.0, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 10.703703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.1329067349433899, "kl": 0.018226811662316322, "learning_rate": 2.885e-06, "loss": 0.0009, "num_tokens": 185514.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 578 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 17.0, "completions/mean_terminated_length": 17.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 10.722222222222221, "frac_reward_zero_std": 1.0, "grad_norm": 0.005374416708946228, "kl": 0.00021800837566843256, "learning_rate": 2.89e-06, "loss": 0.0, "num_tokens": 185822.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_combined/mean": 1.0, "rewards/reward_combined/std": 0.0, "step": 579 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.75, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 20.75, "completions/mean_terminated_length": 20.75, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 10.74074074074074, "frac_reward_zero_std": 0.0, "grad_norm": 5.106919765472412, "kl": 0.009536846540868282, "learning_rate": 2.8950000000000002e-06, "loss": 0.0094, "num_tokens": 186153.0, "reward": 2.5, "reward_std": 1.7320507764816284, "rewards/reward_combined/mean": 2.5, "rewards/reward_combined/std": 1.7320507764816284, "step": 580 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 10.75925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.0014386468101292849, "kl": 5.9895217418670654e-05, "learning_rate": 2.9e-06, "loss": 0.0, "num_tokens": 186373.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 581 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 28.75, "completions/clipped_ratio": 0.0, "completions/max_length": 34.0, "completions/max_terminated_length": 34.0, "completions/mean_length": 28.75, "completions/mean_terminated_length": 28.75, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 10.777777777777779, "frac_reward_zero_std": 0.0, "grad_norm": 4.496015548706055, "kl": 0.02194218523800373, "learning_rate": 2.905e-06, "loss": 0.0705, "num_tokens": 186716.0, "reward": 3.125, "reward_std": 0.25, "rewards/reward_combined/mean": 3.125, "rewards/reward_combined/std": 0.25, "step": 582 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.25, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 16.25, "completions/mean_terminated_length": 16.25, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 10.796296296296296, "frac_reward_zero_std": 0.0, "grad_norm": 4.352851390838623, "kl": 0.03810996562242508, "learning_rate": 2.91e-06, "loss": -0.0082, "num_tokens": 187021.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 583 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 7.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 7.0, "completions/mean_terminated_length": 7.0, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 10.814814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.03950609639286995, "kl": 0.00465160608291626, "learning_rate": 2.915e-06, "loss": 0.0002, "num_tokens": 187257.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 584 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 10.833333333333334, "frac_reward_zero_std": 1.0, "grad_norm": 0.5915427803993225, "kl": 0.05513983964920044, "learning_rate": 2.92e-06, "loss": 0.0028, "num_tokens": 187473.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 585 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.25, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 19.25, "completions/mean_terminated_length": 19.25, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 10.851851851851851, "frac_reward_zero_std": 1.0, "grad_norm": 0.1148410513997078, "kl": 0.015384792350232601, "learning_rate": 2.925e-06, "loss": 0.0008, "num_tokens": 187770.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 586 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.75, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 15.75, "completions/mean_terminated_length": 15.75, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 10.87037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.0654129832983017, "kl": 0.005554557195864618, "learning_rate": 2.93e-06, "loss": 0.0003, "num_tokens": 188077.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 587 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 18.0, "completions/mean_terminated_length": 18.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 10.88888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.012407872825860977, "kl": 0.0014553872752003372, "learning_rate": 2.9350000000000003e-06, "loss": 0.0001, "num_tokens": 188389.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 588 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.25, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 14.25, "completions/mean_terminated_length": 14.25, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 10.907407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.18130354583263397, "kl": 0.018729012925177813, "learning_rate": 2.9400000000000002e-06, "loss": 0.0011, "num_tokens": 188670.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 589 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 10.925925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.011464902199804783, "kl": 0.01647359598428011, "learning_rate": 2.945e-06, "loss": 0.0008, "num_tokens": 188930.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 590 }, { "clip_ratio/high_max": 0.005494505632668734, "clip_ratio/high_mean": 0.005494505632668734, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005494505632668734, "completion_length": 41.5, "completions/clipped_ratio": 0.0, "completions/max_length": 59.0, "completions/max_terminated_length": 59.0, "completions/mean_length": 41.5, "completions/mean_terminated_length": 41.5, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 10.944444444444445, "frac_reward_zero_std": 0.0, "grad_norm": 2.5976126194000244, "kl": 0.0065139178186655045, "learning_rate": 2.95e-06, "loss": -0.119, "num_tokens": 189376.0, "reward": 0.800000011920929, "reward_std": 1.5033296346664429, "rewards/reward_combined/mean": 0.800000011920929, "rewards/reward_combined/std": 1.5033297538757324, "step": 591 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.5, "completions/clipped_ratio": 0.0, "completions/max_length": 25.0, "completions/max_terminated_length": 25.0, "completions/mean_length": 20.5, "completions/mean_terminated_length": 20.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 10.962962962962964, "frac_reward_zero_std": 0.0, "grad_norm": 6.174934387207031, "kl": 0.03555438295006752, "learning_rate": 2.955e-06, "loss": -0.0366, "num_tokens": 189686.0, "reward": 3.25, "reward_std": 3.4278273582458496, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 3.4278273582458496, "step": 592 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.75, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 13.75, "completions/mean_terminated_length": 13.75, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 10.981481481481481, "frac_reward_zero_std": 0.0, "grad_norm": 6.519599914550781, "kl": 0.04692198894917965, "learning_rate": 2.96e-06, "loss": -0.0083, "num_tokens": 189957.0, "reward": 5.125, "reward_std": 3.4731109142303467, "rewards/reward_combined/mean": 5.125, "rewards/reward_combined/std": 3.4731109142303467, "step": 593 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 12.5, "completions/mean_terminated_length": 12.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 11.0, "frac_reward_zero_std": 0.0, "grad_norm": 6.836474418640137, "kl": 0.02765258401632309, "learning_rate": 2.965e-06, "loss": 0.114, "num_tokens": 190231.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 594 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.75, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 4.75, "completions/mean_terminated_length": 4.75, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 11.018518518518519, "frac_reward_zero_std": 1.0, "grad_norm": 0.1482357531785965, "kl": 0.007548679132014513, "learning_rate": 2.97e-06, "loss": 0.0004, "num_tokens": 190470.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 595 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.75, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 16.75, "completions/mean_terminated_length": 16.75, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 11.037037037037036, "frac_reward_zero_std": 1.0, "grad_norm": 0.09669455140829086, "kl": 0.012206693179905415, "learning_rate": 2.9750000000000003e-06, "loss": 0.0006, "num_tokens": 190765.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 596 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 7.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 7.0, "completions/mean_terminated_length": 7.0, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 11.055555555555555, "frac_reward_zero_std": 1.0, "grad_norm": 0.03816010802984238, "kl": 0.0033163258340209723, "learning_rate": 2.9800000000000003e-06, "loss": 0.0002, "num_tokens": 191001.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 597 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.75, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 8.75, "completions/mean_terminated_length": 8.75, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 11.074074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.13606086373329163, "kl": 0.0043918540759477764, "learning_rate": 2.9850000000000002e-06, "loss": 0.0003, "num_tokens": 191256.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 598 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 11.092592592592593, "frac_reward_zero_std": 1.0, "grad_norm": 0.05955702066421509, "kl": 0.0011958777904510498, "learning_rate": 2.99e-06, "loss": 0.0001, "num_tokens": 191528.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 599 }, { "clip_ratio/high_max": 0.013888888992369175, "clip_ratio/high_mean": 0.013888888992369175, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.013888888992369175, "completion_length": 17.5, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 17.5, "completions/mean_terminated_length": 17.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 11.11111111111111, "frac_reward_zero_std": 0.0, "grad_norm": 8.028421401977539, "kl": 0.0521677415817976, "learning_rate": 2.995e-06, "loss": -0.0341, "num_tokens": 191818.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 600 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 29.25, "completions/clipped_ratio": 0.0, "completions/max_length": 32.0, "completions/max_terminated_length": 32.0, "completions/mean_length": 29.25, "completions/mean_terminated_length": 29.25, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 11.12962962962963, "frac_reward_zero_std": 0.0, "grad_norm": 4.80760383605957, "kl": 0.014834481291472912, "learning_rate": 3e-06, "loss": -0.0294, "num_tokens": 192171.0, "reward": 2.75, "reward_std": 1.5, "rewards/reward_combined/mean": 2.75, "rewards/reward_combined/std": 1.5, "step": 601 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 18.0, "completions/mean_terminated_length": 18.0, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 11.148148148148149, "frac_reward_zero_std": 0.0, "grad_norm": 5.690886497497559, "kl": 0.016443987376987934, "learning_rate": 3.005e-06, "loss": -0.043, "num_tokens": 192463.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 602 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 18.0, "completions/mean_terminated_length": 18.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 11.166666666666666, "frac_reward_zero_std": 0.0, "grad_norm": 3.8554577827453613, "kl": 0.3909573797136545, "learning_rate": 3.01e-06, "loss": -0.107, "num_tokens": 192787.0, "reward": 6.125, "reward_std": 3.75, "rewards/reward_combined/mean": 6.125, "rewards/reward_combined/std": 3.75, "step": 603 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 7.25, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 7.25, "completions/mean_terminated_length": 7.25, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 11.185185185185185, "frac_reward_zero_std": 0.0, "grad_norm": 8.104565620422363, "kl": 0.036157434806227684, "learning_rate": 3.0150000000000004e-06, "loss": 0.3131, "num_tokens": 193016.0, "reward": 3.875, "reward_std": 0.25, "rewards/reward_combined/mean": 3.875, "rewards/reward_combined/std": 0.25, "step": 604 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.75, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 23.75, "completions/mean_terminated_length": 23.75, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 11.203703703703704, "frac_reward_zero_std": 0.0, "grad_norm": 1.6586852073669434, "kl": 0.03428817819803953, "learning_rate": 3.0200000000000003e-06, "loss": 0.002, "num_tokens": 193379.0, "reward": 1.625, "reward_std": 1.25, "rewards/reward_combined/mean": 1.625, "rewards/reward_combined/std": 1.25, "step": 605 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.25, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 20.25, "completions/mean_terminated_length": 20.25, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 11.222222222222221, "frac_reward_zero_std": 0.0, "grad_norm": 4.483132839202881, "kl": 0.021460278891026974, "learning_rate": 3.0250000000000003e-06, "loss": -0.0609, "num_tokens": 193688.0, "reward": 4.625, "reward_std": 2.25, "rewards/reward_combined/mean": 4.625, "rewards/reward_combined/std": 2.25, "step": 606 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.5, "completions/clipped_ratio": 0.0, "completions/max_length": 27.0, "completions/max_terminated_length": 27.0, "completions/mean_length": 23.5, "completions/mean_terminated_length": 23.5, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 11.24074074074074, "frac_reward_zero_std": 0.0, "grad_norm": 2.845158815383911, "kl": 0.011465704999864101, "learning_rate": 3.0300000000000002e-06, "loss": 0.0129, "num_tokens": 194010.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 607 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.25, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 15.25, "completions/mean_terminated_length": 15.25, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 11.25925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.0771130621433258, "kl": 0.011705304495990276, "learning_rate": 3.035e-06, "loss": 0.0007, "num_tokens": 194295.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 608 }, { "clip_ratio/high_max": 0.007462686393409967, "clip_ratio/high_mean": 0.007462686393409967, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.007462686393409967, "completion_length": 30.0, "completions/clipped_ratio": 0.0, "completions/max_length": 42.0, "completions/max_terminated_length": 42.0, "completions/mean_length": 30.0, "completions/mean_terminated_length": 30.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 11.277777777777779, "frac_reward_zero_std": 0.0, "grad_norm": 4.4301910400390625, "kl": 0.03224021941423416, "learning_rate": 3.04e-06, "loss": -0.0978, "num_tokens": 194631.0, "reward": 2.25, "reward_std": 1.5, "rewards/reward_combined/mean": 2.25, "rewards/reward_combined/std": 1.5, "step": 609 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 27.0, "completions/max_terminated_length": 27.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 11.296296296296296, "frac_reward_zero_std": 0.0, "grad_norm": 8.349041938781738, "kl": 0.03147148061543703, "learning_rate": 3.045e-06, "loss": 0.0354, "num_tokens": 194907.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 610 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.25, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 4.25, "completions/mean_terminated_length": 4.25, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 11.314814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.060731902718544006, "kl": 0.0010466724634170532, "learning_rate": 3.05e-06, "loss": 0.0001, "num_tokens": 195120.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 611 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.0, "completions/clipped_ratio": 0.0, "completions/max_length": 53.0, "completions/max_terminated_length": 53.0, "completions/mean_length": 23.0, "completions/mean_terminated_length": 23.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 11.333333333333334, "frac_reward_zero_std": 0.0, "grad_norm": 5.039850234985352, "kl": 0.02034814329817891, "learning_rate": 3.0550000000000004e-06, "loss": 0.3058, "num_tokens": 195432.0, "reward": 6.5, "reward_std": 2.0, "rewards/reward_combined/mean": 6.5, "rewards/reward_combined/std": 2.0, "step": 612 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 11.351851851851851, "frac_reward_zero_std": 1.0, "grad_norm": 0.17057907581329346, "kl": 0.012268769787624478, "learning_rate": 3.0600000000000003e-06, "loss": 0.0006, "num_tokens": 195728.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 613 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.75, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.75, "completions/mean_terminated_length": 14.75, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 11.37037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.012006908655166626, "kl": 0.002555397804826498, "learning_rate": 3.0650000000000003e-06, "loss": 0.0001, "num_tokens": 196047.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 614 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 10.5, "completions/mean_terminated_length": 10.5, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 11.38888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.07971962541341782, "kl": 0.004201619885861874, "learning_rate": 3.0700000000000003e-06, "loss": 0.0002, "num_tokens": 196313.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 615 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.5, "completions/clipped_ratio": 0.0, "completions/max_length": 27.0, "completions/max_terminated_length": 27.0, "completions/mean_length": 25.5, "completions/mean_terminated_length": 25.5, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 11.407407407407407, "frac_reward_zero_std": 0.0, "grad_norm": 5.6288275718688965, "kl": 0.023887116461992264, "learning_rate": 3.075e-06, "loss": 0.0049, "num_tokens": 196663.0, "reward": 2.5, "reward_std": 1.7320507764816284, "rewards/reward_combined/mean": 2.5, "rewards/reward_combined/std": 1.7320507764816284, "step": 616 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 11.425925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.0005134259699843824, "kl": 1.4372169971466064e-05, "learning_rate": 3.08e-06, "loss": 0.0, "num_tokens": 196883.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 617 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 16.5, "completions/mean_terminated_length": 16.5, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 11.444444444444445, "frac_reward_zero_std": 1.0, "grad_norm": 0.0721551850438118, "kl": 0.012493256945163012, "learning_rate": 3.085e-06, "loss": 0.0006, "num_tokens": 197189.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 618 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 11.462962962962964, "frac_reward_zero_std": 1.0, "grad_norm": 0.010592280887067318, "kl": 0.016736089251935482, "learning_rate": 3.09e-06, "loss": 0.0008, "num_tokens": 197449.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 619 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.5, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 2.5, "completions/mean_terminated_length": 2.5, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 11.481481481481481, "frac_reward_zero_std": 1.0, "grad_norm": 0.31523987650871277, "kl": 0.01482035219669342, "learning_rate": 3.0950000000000004e-06, "loss": 0.0009, "num_tokens": 197663.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 620 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.25, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 23.25, "completions/mean_terminated_length": 23.25, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 11.5, "frac_reward_zero_std": 0.0, "grad_norm": 2.3675620555877686, "kl": 0.009331044740974903, "learning_rate": 3.1000000000000004e-06, "loss": -0.0125, "num_tokens": 198000.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 621 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 11.518518518518519, "frac_reward_zero_std": 1.0, "grad_norm": 0.013105317018926144, "kl": 0.0003317179362056777, "learning_rate": 3.1050000000000003e-06, "loss": 0.0, "num_tokens": 198280.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 622 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.25, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 10.25, "completions/mean_terminated_length": 10.25, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 11.537037037037036, "frac_reward_zero_std": 1.0, "grad_norm": 0.4082496464252472, "kl": 0.0478957612067461, "learning_rate": 3.1100000000000003e-06, "loss": 0.0024, "num_tokens": 198549.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 623 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.0, "completions/clipped_ratio": 0.0, "completions/max_length": 27.0, "completions/max_terminated_length": 27.0, "completions/mean_length": 19.0, "completions/mean_terminated_length": 19.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 11.555555555555555, "frac_reward_zero_std": 1.0, "grad_norm": 0.12371762841939926, "kl": 0.003243985876906663, "learning_rate": 3.1150000000000002e-06, "loss": 0.0002, "num_tokens": 198865.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_combined/mean": 1.0, "rewards/reward_combined/std": 0.0, "step": 624 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.0, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 21.0, "completions/mean_terminated_length": 21.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 11.574074074074074, "frac_reward_zero_std": 0.0, "grad_norm": 3.6463756561279297, "kl": 0.022760625928640366, "learning_rate": 3.12e-06, "loss": 0.002, "num_tokens": 199197.0, "reward": 4.0, "reward_std": 4.041451930999756, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 4.041451930999756, "step": 625 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 11.592592592592592, "frac_reward_zero_std": 0.0, "grad_norm": 4.4754486083984375, "kl": 0.026626771315932274, "learning_rate": 3.125e-06, "loss": -0.0716, "num_tokens": 199485.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 626 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.5, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 3.5, "completions/mean_terminated_length": 3.5, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 11.61111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.3296201229095459, "kl": 0.014200113713741302, "learning_rate": 3.13e-06, "loss": 0.0007, "num_tokens": 199695.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 627 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 11.62962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.02637873776257038, "kl": 0.0014821384102106094, "learning_rate": 3.135e-06, "loss": 0.0001, "num_tokens": 199947.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 628 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.0, "completions/max_length": 38.0, "completions/max_terminated_length": 38.0, "completions/mean_length": 32.0, "completions/mean_terminated_length": 32.0, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 11.648148148148149, "frac_reward_zero_std": 0.0, "grad_norm": 4.1041340827941895, "kl": 0.04533323273062706, "learning_rate": 3.1400000000000004e-06, "loss": -0.0131, "num_tokens": 200303.0, "reward": 1.25, "reward_std": 1.1902379989624023, "rewards/reward_combined/mean": 1.25, "rewards/reward_combined/std": 1.190238118171692, "step": 629 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.0, "completions/clipped_ratio": 0.0, "completions/max_length": 30.0, "completions/max_terminated_length": 30.0, "completions/mean_length": 23.0, "completions/mean_terminated_length": 23.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 11.666666666666666, "frac_reward_zero_std": 0.0, "grad_norm": 5.180081367492676, "kl": 0.11792661249637604, "learning_rate": 3.1450000000000004e-06, "loss": 0.1482, "num_tokens": 200619.0, "reward": 2.875, "reward_std": 3.3008837699890137, "rewards/reward_combined/mean": 2.875, "rewards/reward_combined/std": 3.3008837699890137, "step": 630 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 28.0, "completions/clipped_ratio": 0.0, "completions/max_length": 35.0, "completions/max_terminated_length": 35.0, "completions/mean_length": 28.0, "completions/mean_terminated_length": 28.0, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 11.685185185185185, "frac_reward_zero_std": 0.0, "grad_norm": 4.240811347961426, "kl": 0.024685528129339218, "learning_rate": 3.1500000000000003e-06, "loss": -0.0784, "num_tokens": 200955.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 631 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 5.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 5.0, "completions/mean_terminated_length": 5.0, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 11.703703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.008210687898099422, "kl": 0.002026462461799383, "learning_rate": 3.1550000000000003e-06, "loss": 0.0001, "num_tokens": 201175.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 632 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 11.722222222222221, "frac_reward_zero_std": 1.0, "grad_norm": 0.008543751202523708, "kl": 0.004745917394757271, "learning_rate": 3.1600000000000002e-06, "loss": 0.0002, "num_tokens": 201487.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 633 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 11.74074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.026628874242305756, "kl": 0.03466668166220188, "learning_rate": 3.165e-06, "loss": 0.0017, "num_tokens": 201771.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 634 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 7.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 7.0, "completions/mean_terminated_length": 7.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 11.75925925925926, "frac_reward_zero_std": 0.0, "grad_norm": 10.85914421081543, "kl": 0.02291577309370041, "learning_rate": 3.17e-06, "loss": -0.1651, "num_tokens": 202011.0, "reward": 3.875, "reward_std": 0.25, "rewards/reward_combined/mean": 3.875, "rewards/reward_combined/std": 0.25, "step": 635 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 18.0, "completions/mean_terminated_length": 18.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 11.777777777777779, "frac_reward_zero_std": 1.0, "grad_norm": 0.01267574168741703, "kl": 0.0017323378124274313, "learning_rate": 3.175e-06, "loss": 0.0001, "num_tokens": 202323.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 636 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.25, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 13.25, "completions/mean_terminated_length": 13.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 11.796296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.5114508271217346, "kl": 0.04292368586175144, "learning_rate": 3.1800000000000005e-06, "loss": 0.002, "num_tokens": 202592.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 637 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 11.814814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.19975721836090088, "kl": 0.023725814186036587, "learning_rate": 3.1850000000000004e-06, "loss": 0.0012, "num_tokens": 202884.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 638 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.25, "completions/clipped_ratio": 0.0, "completions/max_length": 45.0, "completions/max_terminated_length": 45.0, "completions/mean_length": 17.25, "completions/mean_terminated_length": 17.25, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 11.833333333333334, "frac_reward_zero_std": 0.0, "grad_norm": 6.084535121917725, "kl": 0.003431393764913082, "learning_rate": 3.1900000000000004e-06, "loss": 0.351, "num_tokens": 203181.0, "reward": 3.5, "reward_std": 1.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 1.0, "step": 639 }, { "clip_ratio/high_max": 0.01260304357856512, "clip_ratio/high_mean": 0.01260304357856512, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.01260304357856512, "completion_length": 39.75, "completions/clipped_ratio": 0.0, "completions/max_length": 51.0, "completions/max_terminated_length": 51.0, "completions/mean_length": 39.75, "completions/mean_terminated_length": 39.75, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 11.851851851851851, "frac_reward_zero_std": 0.0, "grad_norm": 2.1080126762390137, "kl": 0.009067337960004807, "learning_rate": 3.1950000000000003e-06, "loss": -0.1582, "num_tokens": 203620.0, "reward": 0.25, "reward_std": 0.28867512941360474, "rewards/reward_combined/mean": 0.25, "rewards/reward_combined/std": 0.28867512941360474, "step": 640 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 11.87037037037037, "frac_reward_zero_std": 0.0, "grad_norm": 9.149620056152344, "kl": 0.013570900075137615, "learning_rate": 3.2000000000000003e-06, "loss": 0.0948, "num_tokens": 203910.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 641 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 11.88888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.024699613451957703, "kl": 0.000650021422188729, "learning_rate": 3.2050000000000002e-06, "loss": 0.0, "num_tokens": 204166.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 642 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 11.907407407407408, "frac_reward_zero_std": 0.0, "grad_norm": 8.454242706298828, "kl": 0.026605118066072464, "learning_rate": 3.21e-06, "loss": -0.0829, "num_tokens": 204430.0, "reward": 5.75, "reward_std": 2.598076105117798, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 2.598076105117798, "step": 643 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 11.925925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.0018074268009513617, "kl": 0.0012150896363891661, "learning_rate": 3.215e-06, "loss": 0.0001, "num_tokens": 204710.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 644 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.25, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 18.25, "completions/mean_terminated_length": 18.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 11.944444444444445, "frac_reward_zero_std": 0.0, "grad_norm": 9.394289016723633, "kl": 0.10707097500562668, "learning_rate": 3.2200000000000005e-06, "loss": 0.0145, "num_tokens": 205023.0, "reward": 5.0, "reward_std": 3.5590262413024902, "rewards/reward_combined/mean": 5.0, "rewards/reward_combined/std": 3.5590262413024902, "step": 645 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.0, "completions/clipped_ratio": 0.0, "completions/max_length": 37.0, "completions/max_terminated_length": 37.0, "completions/mean_length": 24.0, "completions/mean_terminated_length": 24.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 11.962962962962964, "frac_reward_zero_std": 0.0, "grad_norm": 4.39867639541626, "kl": 0.017398925498127937, "learning_rate": 3.2250000000000005e-06, "loss": 0.0411, "num_tokens": 205339.0, "reward": 4.125, "reward_std": 2.25, "rewards/reward_combined/mean": 4.125, "rewards/reward_combined/std": 2.25, "step": 646 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 79.25, "completions/clipped_ratio": 0.25, "completions/max_length": 256.0, "completions/max_terminated_length": 25.0, "completions/mean_length": 79.25, "completions/mean_terminated_length": 20.33333396911621, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 11.981481481481481, "frac_reward_zero_std": 0.0, "grad_norm": 4.019659042358398, "kl": 0.05590972024947405, "learning_rate": 3.2300000000000004e-06, "loss": 0.4562, "num_tokens": 205908.0, "reward": 4.0, "reward_std": 4.041451930999756, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 4.041451930999756, "step": 647 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 74.5, "completions/clipped_ratio": 0.25, "completions/max_length": 256.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 74.5, "completions/mean_terminated_length": 14.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 12.0, "frac_reward_zero_std": 0.0, "grad_norm": 2.0201008319854736, "kl": 0.012552737374790013, "learning_rate": 3.2350000000000004e-06, "loss": 0.4432, "num_tokens": 206418.0, "reward": 5.25, "reward_std": 5.5, "rewards/reward_combined/mean": 5.25, "rewards/reward_combined/std": 5.5, "step": 648 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.75, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 17.75, "completions/mean_terminated_length": 17.75, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 12.018518518518519, "frac_reward_zero_std": 1.0, "grad_norm": 0.04172515869140625, "kl": 0.005732719786465168, "learning_rate": 3.2400000000000003e-06, "loss": 0.0003, "num_tokens": 206717.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 649 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 12.037037037037036, "frac_reward_zero_std": 1.0, "grad_norm": 0.3228820562362671, "kl": 0.03573134168982506, "learning_rate": 3.2450000000000003e-06, "loss": 0.0018, "num_tokens": 206991.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 650 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.25, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 19.25, "completions/mean_terminated_length": 19.25, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 12.055555555555555, "frac_reward_zero_std": 0.0, "grad_norm": 7.065656661987305, "kl": 0.07700672186911106, "learning_rate": 3.2500000000000002e-06, "loss": 0.0947, "num_tokens": 207288.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 651 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.75, "completions/clipped_ratio": 0.0, "completions/max_length": 27.0, "completions/max_terminated_length": 27.0, "completions/mean_length": 17.75, "completions/mean_terminated_length": 17.75, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 12.074074074074074, "frac_reward_zero_std": 0.0, "grad_norm": 7.988717555999756, "kl": 0.022191676776856184, "learning_rate": 3.255e-06, "loss": 0.166, "num_tokens": 207575.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 652 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 12.092592592592593, "frac_reward_zero_std": 1.0, "grad_norm": 0.012387805618345737, "kl": 0.01644258387386799, "learning_rate": 3.2600000000000006e-06, "loss": 0.0008, "num_tokens": 207835.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 653 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.75, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 10.75, "completions/mean_terminated_length": 10.75, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 12.11111111111111, "frac_reward_zero_std": 0.0, "grad_norm": 3.305415391921997, "kl": 0.017648470122367144, "learning_rate": 3.2650000000000005e-06, "loss": 0.0018, "num_tokens": 208106.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 654 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 12.12962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.029824266210198402, "kl": 0.004355549113824964, "learning_rate": 3.2700000000000005e-06, "loss": 0.0002, "num_tokens": 208386.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 655 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.75, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 17.75, "completions/mean_terminated_length": 17.75, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 12.148148148148149, "frac_reward_zero_std": 0.0, "grad_norm": 6.580255508422852, "kl": 0.09506170824170113, "learning_rate": 3.2750000000000004e-06, "loss": 0.0017, "num_tokens": 208709.0, "reward": 4.625, "reward_std": 3.4731109142303467, "rewards/reward_combined/mean": 4.625, "rewards/reward_combined/std": 3.4731109142303467, "step": 656 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 12.166666666666666, "frac_reward_zero_std": 1.0, "grad_norm": 0.07925370335578918, "kl": 0.003697037696838379, "learning_rate": 3.2800000000000004e-06, "loss": 0.0002, "num_tokens": 208969.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 657 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 12.185185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.02828850969672203, "kl": 0.0003725931055669207, "learning_rate": 3.2850000000000003e-06, "loss": 0.0, "num_tokens": 209239.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 658 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.0, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 19.0, "completions/mean_terminated_length": 19.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 12.203703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.05844036862254143, "kl": 0.002485842094756663, "learning_rate": 3.2900000000000003e-06, "loss": 0.0001, "num_tokens": 209555.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 659 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 12.222222222222221, "frac_reward_zero_std": 1.0, "grad_norm": 0.031397074460983276, "kl": 0.005742944777011871, "learning_rate": 3.2950000000000002e-06, "loss": 0.0003, "num_tokens": 209791.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 660 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 12.24074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.19430005550384521, "kl": 0.009313431568443775, "learning_rate": 3.3000000000000006e-06, "loss": 0.0004, "num_tokens": 210051.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 661 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.75, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 15.75, "completions/mean_terminated_length": 15.75, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 12.25925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.17920280992984772, "kl": 0.01352369668893516, "learning_rate": 3.3050000000000005e-06, "loss": 0.0007, "num_tokens": 210374.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 662 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 72.75, "completions/clipped_ratio": 0.25, "completions/max_length": 256.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 72.75, "completions/mean_terminated_length": 11.666666984558105, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 12.277777777777779, "frac_reward_zero_std": 0.0, "grad_norm": 4.064810752868652, "kl": 0.007596010575070977, "learning_rate": 3.3100000000000005e-06, "loss": 0.462, "num_tokens": 210917.0, "reward": 5.375, "reward_std": 5.25, "rewards/reward_combined/mean": 5.375, "rewards/reward_combined/std": 5.25, "step": 663 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.25, "completions/clipped_ratio": 0.0, "completions/max_length": 29.0, "completions/max_terminated_length": 29.0, "completions/mean_length": 23.25, "completions/mean_terminated_length": 23.25, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 12.296296296296296, "frac_reward_zero_std": 0.0, "grad_norm": 7.849841594696045, "kl": 0.33733621053397655, "learning_rate": 3.3150000000000004e-06, "loss": -0.0788, "num_tokens": 211230.0, "reward": 2.5, "reward_std": 1.7320507764816284, "rewards/reward_combined/mean": 2.5, "rewards/reward_combined/std": 1.7320507764816284, "step": 664 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 12.314814814814815, "frac_reward_zero_std": 0.0, "grad_norm": 10.66535758972168, "kl": 0.04011658392846584, "learning_rate": 3.3200000000000004e-06, "loss": 0.0652, "num_tokens": 211520.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 665 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.0, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 21.0, "completions/mean_terminated_length": 21.0, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 12.333333333333334, "frac_reward_zero_std": 1.0, "grad_norm": 0.15042650699615479, "kl": 0.029162343591451645, "learning_rate": 3.3250000000000004e-06, "loss": 0.0015, "num_tokens": 211852.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 666 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 12.351851851851851, "frac_reward_zero_std": 1.0, "grad_norm": 0.10258183628320694, "kl": 0.012232841923832893, "learning_rate": 3.3300000000000003e-06, "loss": 0.0006, "num_tokens": 212124.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 667 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 17.0, "completions/mean_terminated_length": 17.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 12.37037037037037, "frac_reward_zero_std": 0.0, "grad_norm": 6.723768711090088, "kl": 0.34921352565288544, "learning_rate": 3.3350000000000003e-06, "loss": 0.0228, "num_tokens": 212432.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 668 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 12.38888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.21781961619853973, "kl": 0.021847156807780266, "learning_rate": 3.3400000000000006e-06, "loss": 0.0011, "num_tokens": 212728.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 669 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 31.5, "completions/clipped_ratio": 0.0, "completions/max_length": 37.0, "completions/max_terminated_length": 37.0, "completions/mean_length": 31.5, "completions/mean_terminated_length": 31.5, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 12.407407407407407, "frac_reward_zero_std": 0.0, "grad_norm": 2.7965657711029053, "kl": 0.01749566290527582, "learning_rate": 3.3450000000000006e-06, "loss": -0.0069, "num_tokens": 213134.0, "reward": 0.375, "reward_std": 0.25, "rewards/reward_combined/mean": 0.375, "rewards/reward_combined/std": 0.25, "step": 670 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 12.5, "completions/mean_terminated_length": 12.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 12.425925925925926, "frac_reward_zero_std": 0.0, "grad_norm": 4.62512731552124, "kl": 0.011569634079933167, "learning_rate": 3.3500000000000005e-06, "loss": -0.0421, "num_tokens": 213404.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 671 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.75, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 23.75, "completions/mean_terminated_length": 23.75, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 12.444444444444445, "frac_reward_zero_std": 0.0, "grad_norm": 1.157044768333435, "kl": 0.09040183201432228, "learning_rate": 3.3550000000000005e-06, "loss": -0.0045, "num_tokens": 213767.0, "reward": 2.875, "reward_std": 1.25, "rewards/reward_combined/mean": 2.875, "rewards/reward_combined/std": 1.25, "step": 672 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 12.462962962962964, "frac_reward_zero_std": 0.0, "grad_norm": 9.049518585205078, "kl": 0.08896612375974655, "learning_rate": 3.3600000000000004e-06, "loss": 0.0797, "num_tokens": 214057.0, "reward": 5.25, "reward_std": 4.5, "rewards/reward_combined/mean": 5.25, "rewards/reward_combined/std": 4.5, "step": 673 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.5, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 2.5, "completions/mean_terminated_length": 2.5, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 12.481481481481481, "frac_reward_zero_std": 1.0, "grad_norm": 0.10739633440971375, "kl": 0.002879232168197632, "learning_rate": 3.3650000000000004e-06, "loss": 0.0001, "num_tokens": 214263.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 674 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.75, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 11.75, "completions/mean_terminated_length": 11.75, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 12.5, "frac_reward_zero_std": 1.0, "grad_norm": 0.30027636885643005, "kl": 0.03527809772640467, "learning_rate": 3.3700000000000003e-06, "loss": 0.0019, "num_tokens": 214534.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 675 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 12.518518518518519, "frac_reward_zero_std": 1.0, "grad_norm": 0.2751378118991852, "kl": 0.04468468949198723, "learning_rate": 3.3750000000000003e-06, "loss": 0.0021, "num_tokens": 214836.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 676 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.0, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 21.0, "completions/mean_terminated_length": 21.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 12.537037037037036, "frac_reward_zero_std": 0.0, "grad_norm": 3.805227041244507, "kl": 0.010624171234667301, "learning_rate": 3.3800000000000007e-06, "loss": 0.0217, "num_tokens": 215164.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 677 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 12.555555555555555, "frac_reward_zero_std": 1.0, "grad_norm": 0.0019216621294617653, "kl": 0.0013114341418258846, "learning_rate": 3.3850000000000006e-06, "loss": 0.0001, "num_tokens": 215444.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 678 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 12.574074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 4.073700256412849e-05, "kl": 1.430511474609375e-06, "learning_rate": 3.3900000000000006e-06, "loss": 0.0, "num_tokens": 215664.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 679 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.25, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 19.25, "completions/mean_terminated_length": 19.25, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 12.592592592592592, "frac_reward_zero_std": 0.0, "grad_norm": 4.005073070526123, "kl": 0.023659497499465942, "learning_rate": 3.3950000000000005e-06, "loss": -0.0157, "num_tokens": 215993.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 680 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.75, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 6.75, "completions/mean_terminated_length": 6.75, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 12.61111111111111, "frac_reward_zero_std": 0.0, "grad_norm": 11.43302059173584, "kl": 0.029769452288746834, "learning_rate": 3.4000000000000005e-06, "loss": 0.2973, "num_tokens": 216220.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 681 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 75.5, "completions/clipped_ratio": 0.0, "completions/max_length": 204.0, "completions/max_terminated_length": 204.0, "completions/mean_length": 75.5, "completions/mean_terminated_length": 75.5, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 12.62962962962963, "frac_reward_zero_std": 0.0, "grad_norm": 2.688279628753662, "kl": 0.05401912610977888, "learning_rate": 3.4050000000000004e-06, "loss": 0.3469, "num_tokens": 216746.0, "reward": 4.375, "reward_std": 3.8810436725616455, "rewards/reward_combined/mean": 4.375, "rewards/reward_combined/std": 3.8810436725616455, "step": 682 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.75, "completions/clipped_ratio": 0.0, "completions/max_length": 34.0, "completions/max_terminated_length": 34.0, "completions/mean_length": 32.75, "completions/mean_terminated_length": 32.75, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 12.648148148148149, "frac_reward_zero_std": 0.0, "grad_norm": 3.8437788486480713, "kl": 0.018569111824035645, "learning_rate": 3.4100000000000004e-06, "loss": -0.0078, "num_tokens": 217093.0, "reward": 2.25, "reward_std": 1.443375587463379, "rewards/reward_combined/mean": 2.25, "rewards/reward_combined/std": 1.4433757066726685, "step": 683 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.75, "completions/clipped_ratio": 0.0, "completions/max_length": 29.0, "completions/max_terminated_length": 29.0, "completions/mean_length": 22.75, "completions/mean_terminated_length": 22.75, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 12.666666666666666, "frac_reward_zero_std": 0.0, "grad_norm": 4.572022438049316, "kl": 0.014789620414376259, "learning_rate": 3.4150000000000003e-06, "loss": -0.0477, "num_tokens": 217412.0, "reward": 6.75, "reward_std": 2.5, "rewards/reward_combined/mean": 6.75, "rewards/reward_combined/std": 2.5, "step": 684 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 28.5, "completions/clipped_ratio": 0.0, "completions/max_length": 34.0, "completions/max_terminated_length": 34.0, "completions/mean_length": 28.5, "completions/mean_terminated_length": 28.5, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 12.685185185185185, "frac_reward_zero_std": 0.0, "grad_norm": 3.008175849914551, "kl": 0.030666828621178865, "learning_rate": 3.4200000000000007e-06, "loss": 0.0154, "num_tokens": 217762.0, "reward": 1.375, "reward_std": 1.4361406564712524, "rewards/reward_combined/mean": 1.375, "rewards/reward_combined/std": 1.4361406564712524, "step": 685 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 68.0, "completions/clipped_ratio": 0.0, "completions/max_length": 235.0, "completions/max_terminated_length": 235.0, "completions/mean_length": 68.0, "completions/mean_terminated_length": 68.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 12.703703703703704, "frac_reward_zero_std": 0.0, "grad_norm": 2.0841970443725586, "kl": 0.008805236197076738, "learning_rate": 3.4250000000000007e-06, "loss": 0.4522, "num_tokens": 218246.0, "reward": 5.125, "reward_std": 5.75, "rewards/reward_combined/mean": 5.125, "rewards/reward_combined/std": 5.75, "step": 686 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.5, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 12.5, "completions/mean_terminated_length": 12.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 12.722222222222221, "frac_reward_zero_std": 0.0, "grad_norm": 5.84593391418457, "kl": 0.014060269924812019, "learning_rate": 3.4300000000000006e-06, "loss": -0.0425, "num_tokens": 218520.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 687 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.75, "completions/clipped_ratio": 0.0, "completions/max_length": 30.0, "completions/max_terminated_length": 30.0, "completions/mean_length": 24.75, "completions/mean_terminated_length": 24.75, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 12.74074074074074, "frac_reward_zero_std": 0.0, "grad_norm": 5.729602813720703, "kl": 0.02341025322675705, "learning_rate": 3.4350000000000006e-06, "loss": 0.0442, "num_tokens": 218867.0, "reward": 1.75, "reward_std": 1.5, "rewards/reward_combined/mean": 1.75, "rewards/reward_combined/std": 1.5, "step": 688 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 17.0, "completions/mean_terminated_length": 17.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 12.75925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.033716313540935516, "kl": 0.0015222903894027695, "learning_rate": 3.44e-06, "loss": 0.0001, "num_tokens": 219175.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_combined/mean": 1.0, "rewards/reward_combined/std": 0.0, "step": 689 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.25, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 4.25, "completions/mean_terminated_length": 4.25, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 12.777777777777779, "frac_reward_zero_std": 1.0, "grad_norm": 0.03615173324942589, "kl": 0.00038520395173691213, "learning_rate": 3.445e-06, "loss": 0.0, "num_tokens": 219388.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 690 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 5.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 5.0, "completions/mean_terminated_length": 5.0, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 12.796296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.00724438764154911, "kl": 0.002084410283714533, "learning_rate": 3.45e-06, "loss": 0.0001, "num_tokens": 219608.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 691 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 7.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 7.0, "completions/mean_terminated_length": 7.0, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 12.814814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.04499228298664093, "kl": 0.00474282237701118, "learning_rate": 3.455e-06, "loss": 0.0002, "num_tokens": 219844.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 692 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 12.833333333333334, "frac_reward_zero_std": 1.0, "grad_norm": 0.008822840638458729, "kl": 0.0001635670632822439, "learning_rate": 3.46e-06, "loss": 0.0, "num_tokens": 220100.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 693 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.5, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 16.5, "completions/mean_terminated_length": 16.5, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 12.851851851851851, "frac_reward_zero_std": 1.0, "grad_norm": 0.1546824872493744, "kl": 0.014349468052387238, "learning_rate": 3.465e-06, "loss": 0.0007, "num_tokens": 220414.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 694 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 12.87037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.022769780829548836, "kl": 0.0023452548775821924, "learning_rate": 3.4700000000000002e-06, "loss": 0.0001, "num_tokens": 220674.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 695 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.0, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 21.0, "completions/mean_terminated_length": 21.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 12.88888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.3842354714870453, "kl": 0.0912703163921833, "learning_rate": 3.475e-06, "loss": 0.0045, "num_tokens": 220982.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 696 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.0, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 20.0, "completions/mean_terminated_length": 20.0, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 12.907407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.06371401995420456, "kl": 0.015870106406509876, "learning_rate": 3.48e-06, "loss": 0.0008, "num_tokens": 221282.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 697 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 12.925925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.5435612201690674, "kl": 0.029072046279907227, "learning_rate": 3.485e-06, "loss": 0.0015, "num_tokens": 221526.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 698 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 12.944444444444445, "frac_reward_zero_std": 1.0, "grad_norm": 0.0519857220351696, "kl": 0.026654936373233795, "learning_rate": 3.49e-06, "loss": 0.0014, "num_tokens": 221822.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 699 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 12.962962962962964, "frac_reward_zero_std": 1.0, "grad_norm": 0.012222686782479286, "kl": 0.0027244091033935547, "learning_rate": 3.495e-06, "loss": 0.0001, "num_tokens": 222034.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 700 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.75, "completions/clipped_ratio": 0.0, "completions/max_length": 40.0, "completions/max_terminated_length": 40.0, "completions/mean_length": 32.75, "completions/mean_terminated_length": 32.75, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 12.981481481481481, "frac_reward_zero_std": 0.0, "grad_norm": 2.926107168197632, "kl": 0.06518355384469032, "learning_rate": 3.5e-06, "loss": 0.0391, "num_tokens": 222393.0, "reward": 1.375, "reward_std": 1.4361406564712524, "rewards/reward_combined/mean": 1.375, "rewards/reward_combined/std": 1.4361406564712524, "step": 701 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.5, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 19.5, "completions/mean_terminated_length": 19.5, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 13.0, "frac_reward_zero_std": 0.0, "grad_norm": 5.937601089477539, "kl": 0.061592782847583294, "learning_rate": 3.505e-06, "loss": 0.0994, "num_tokens": 222699.0, "reward": 2.875, "reward_std": 1.25, "rewards/reward_combined/mean": 2.875, "rewards/reward_combined/std": 1.25, "step": 702 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 13.018518518518519, "frac_reward_zero_std": 1.0, "grad_norm": 0.044845227152109146, "kl": 0.004706483334302902, "learning_rate": 3.5100000000000003e-06, "loss": 0.0002, "num_tokens": 223011.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 703 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.75, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 20.75, "completions/mean_terminated_length": 20.75, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 13.037037037037036, "frac_reward_zero_std": 1.0, "grad_norm": 0.30242976546287537, "kl": 0.04299968760460615, "learning_rate": 3.5150000000000002e-06, "loss": 0.0022, "num_tokens": 223342.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 704 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.25, "completions/clipped_ratio": 0.0, "completions/max_length": 31.0, "completions/max_terminated_length": 31.0, "completions/mean_length": 21.25, "completions/mean_terminated_length": 21.25, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 13.055555555555555, "frac_reward_zero_std": 0.0, "grad_norm": 5.661122798919678, "kl": 0.10700121521949768, "learning_rate": 3.52e-06, "loss": -0.031, "num_tokens": 223647.0, "reward": 1.75, "reward_std": 1.5, "rewards/reward_combined/mean": 1.75, "rewards/reward_combined/std": 1.5, "step": 705 }, { "clip_ratio/high_max": 0.011111111380159855, "clip_ratio/high_mean": 0.011111111380159855, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.011111111380159855, "completion_length": 80.0, "completions/clipped_ratio": 0.25, "completions/max_length": 256.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 80.0, "completions/mean_terminated_length": 21.33333396911621, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 13.074074074074074, "frac_reward_zero_std": 0.0, "grad_norm": 2.5835976600646973, "kl": 0.06982976198196411, "learning_rate": 3.525e-06, "loss": 0.4324, "num_tokens": 224219.0, "reward": 4.875, "reward_std": 5.25, "rewards/reward_combined/mean": 4.875, "rewards/reward_combined/std": 5.25, "step": 706 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 13.092592592592593, "frac_reward_zero_std": 1.0, "grad_norm": 0.11502530425786972, "kl": 0.019980875309556723, "learning_rate": 3.53e-06, "loss": 0.001, "num_tokens": 224523.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 707 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 73.5, "completions/clipped_ratio": 0.25, "completions/max_length": 256.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 73.5, "completions/mean_terminated_length": 12.666666984558105, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 13.11111111111111, "frac_reward_zero_std": 0.0, "grad_norm": 1.661816954612732, "kl": 0.002313772216439247, "learning_rate": 3.535e-06, "loss": 0.4547, "num_tokens": 225029.0, "reward": 5.875, "reward_std": 4.25, "rewards/reward_combined/mean": 5.875, "rewards/reward_combined/std": 4.25, "step": 708 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.25, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.25, "completions/mean_terminated_length": 13.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 13.12962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.02849768102169037, "kl": 0.0023765690275467932, "learning_rate": 3.54e-06, "loss": 0.0001, "num_tokens": 225306.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 709 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.5, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 12.5, "completions/mean_terminated_length": 12.5, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 13.148148148148149, "frac_reward_zero_std": 1.0, "grad_norm": 0.22840283811092377, "kl": 0.00965544837526977, "learning_rate": 3.545e-06, "loss": 0.0005, "num_tokens": 225580.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 710 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.75, "completions/clipped_ratio": 0.0, "completions/max_length": 46.0, "completions/max_terminated_length": 46.0, "completions/mean_length": 19.75, "completions/mean_terminated_length": 19.75, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 13.166666666666666, "frac_reward_zero_std": 0.0, "grad_norm": 3.0917246341705322, "kl": 0.01081078452989459, "learning_rate": 3.5500000000000003e-06, "loss": 0.3072, "num_tokens": 225883.0, "reward": 6.125, "reward_std": 3.75, "rewards/reward_combined/mean": 6.125, "rewards/reward_combined/std": 3.75, "step": 711 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 10.5, "completions/mean_terminated_length": 10.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 13.185185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.1304813027381897, "kl": 0.008392083691433072, "learning_rate": 3.5550000000000003e-06, "loss": 0.0004, "num_tokens": 226145.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 712 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 7.75, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 7.75, "completions/mean_terminated_length": 7.75, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 13.203703703703704, "frac_reward_zero_std": 0.0, "grad_norm": 22.119352340698242, "kl": 0.06486252695322037, "learning_rate": 3.5600000000000002e-06, "loss": 0.003, "num_tokens": 226372.0, "reward": 2.75, "reward_std": 1.5, "rewards/reward_combined/mean": 2.75, "rewards/reward_combined/std": 1.5, "step": 713 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 13.222222222222221, "frac_reward_zero_std": 1.0, "grad_norm": 0.05395181477069855, "kl": 0.0008561899303458631, "learning_rate": 3.565e-06, "loss": 0.0, "num_tokens": 226640.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 714 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 7.75, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 7.75, "completions/mean_terminated_length": 7.75, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 13.24074074074074, "frac_reward_zero_std": 0.0, "grad_norm": 12.3789701461792, "kl": 0.009366178885102272, "learning_rate": 3.57e-06, "loss": 0.4068, "num_tokens": 226891.0, "reward": 2.5, "reward_std": 1.7320507764816284, "rewards/reward_combined/mean": 2.5, "rewards/reward_combined/std": 1.7320507764816284, "step": 715 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.25, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 10.25, "completions/mean_terminated_length": 10.25, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 13.25925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.024691283702850342, "kl": 0.00036153521796222776, "learning_rate": 3.575e-06, "loss": 0.0, "num_tokens": 227148.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 716 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 13.277777777777779, "frac_reward_zero_std": 0.0, "grad_norm": 4.668971538543701, "kl": 0.008642992353998125, "learning_rate": 3.58e-06, "loss": -0.016, "num_tokens": 227466.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 717 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.0, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 22.0, "completions/mean_terminated_length": 22.0, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 13.296296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.3145133852958679, "kl": 0.03695741109549999, "learning_rate": 3.585e-06, "loss": 0.0018, "num_tokens": 227782.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 718 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.007575757801532745, "clip_ratio/low_min": 0.007575757801532745, "clip_ratio/region_mean": 0.007575757801532745, "completion_length": 31.5, "completions/clipped_ratio": 0.0, "completions/max_length": 37.0, "completions/max_terminated_length": 37.0, "completions/mean_length": 31.5, "completions/mean_terminated_length": 31.5, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 13.314814814814815, "frac_reward_zero_std": 0.0, "grad_norm": 3.1675214767456055, "kl": 0.08254183083772659, "learning_rate": 3.5900000000000004e-06, "loss": -0.0589, "num_tokens": 228132.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 719 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 18.0, "completions/mean_terminated_length": 18.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 13.333333333333334, "frac_reward_zero_std": 1.0, "grad_norm": 0.011034821160137653, "kl": 0.0007874899310991168, "learning_rate": 3.5950000000000003e-06, "loss": 0.0, "num_tokens": 228444.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 720 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.75, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 15.75, "completions/mean_terminated_length": 15.75, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 13.351851851851851, "frac_reward_zero_std": 1.0, "grad_norm": 0.6344883441925049, "kl": 0.06143311597406864, "learning_rate": 3.6000000000000003e-06, "loss": 0.0032, "num_tokens": 228739.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 721 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.25, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 15.25, "completions/mean_terminated_length": 15.25, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 13.37037037037037, "frac_reward_zero_std": 0.0, "grad_norm": 6.370728969573975, "kl": 0.011657073860988021, "learning_rate": 3.6050000000000002e-06, "loss": -0.0742, "num_tokens": 229016.0, "reward": 2.75, "reward_std": 3.5, "rewards/reward_combined/mean": 2.75, "rewards/reward_combined/std": 3.5, "step": 722 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 13.38888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.10871630162000656, "kl": 0.010092085227370262, "learning_rate": 3.61e-06, "loss": 0.0005, "num_tokens": 229280.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 723 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 36.25, "completions/clipped_ratio": 0.0, "completions/max_length": 52.0, "completions/max_terminated_length": 52.0, "completions/mean_length": 36.25, "completions/mean_terminated_length": 36.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 13.407407407407407, "frac_reward_zero_std": 0.0, "grad_norm": 3.791403293609619, "kl": 0.02170178573578596, "learning_rate": 3.615e-06, "loss": 0.0497, "num_tokens": 229645.0, "reward": 3.0, "reward_std": 5.446711540222168, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 5.446711540222168, "step": 724 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 13.425925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 1.0772731304168701, "kl": 0.11461967695504427, "learning_rate": 3.62e-06, "loss": 0.0061, "num_tokens": 229935.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 725 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 13.444444444444445, "frac_reward_zero_std": 0.0, "grad_norm": 6.028342247009277, "kl": 0.014517928007990122, "learning_rate": 3.625e-06, "loss": 0.0433, "num_tokens": 230233.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 726 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 5.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 5.0, "completions/mean_terminated_length": 5.0, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 13.462962962962964, "frac_reward_zero_std": 1.0, "grad_norm": 0.008157488889992237, "kl": 0.001980304718017578, "learning_rate": 3.6300000000000004e-06, "loss": 0.0001, "num_tokens": 230453.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 727 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 89.25, "completions/clipped_ratio": 0.25, "completions/max_length": 256.0, "completions/max_terminated_length": 49.0, "completions/mean_length": 89.25, "completions/mean_terminated_length": 33.66666793823242, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 13.481481481481481, "frac_reward_zero_std": 0.0, "grad_norm": 3.886113405227661, "kl": 0.03889167681336403, "learning_rate": 3.6350000000000003e-06, "loss": 0.0928, "num_tokens": 231026.0, "reward": 4.0, "reward_std": 4.690415859222412, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 4.690415859222412, "step": 728 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.5, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 18.5, "completions/mean_terminated_length": 18.5, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 13.5, "frac_reward_zero_std": 1.0, "grad_norm": 0.15780594944953918, "kl": 0.06208759360015392, "learning_rate": 3.6400000000000003e-06, "loss": 0.003, "num_tokens": 231348.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 729 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.75, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 19.75, "completions/mean_terminated_length": 19.75, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 13.518518518518519, "frac_reward_zero_std": 1.0, "grad_norm": 0.5536521673202515, "kl": 0.06306350696831942, "learning_rate": 3.6450000000000003e-06, "loss": 0.0033, "num_tokens": 231679.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 730 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 13.537037037037036, "frac_reward_zero_std": 1.0, "grad_norm": 0.148001566529274, "kl": 0.005636319518089294, "learning_rate": 3.65e-06, "loss": 0.0003, "num_tokens": 231939.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 731 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 12.5, "completions/mean_terminated_length": 12.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 13.555555555555555, "frac_reward_zero_std": 1.0, "grad_norm": 0.08109533786773682, "kl": 0.00990668823942542, "learning_rate": 3.655e-06, "loss": 0.0005, "num_tokens": 232221.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 732 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.75, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 16.75, "completions/mean_terminated_length": 16.75, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 13.574074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.006147771142423153, "kl": 0.00034103411599062383, "learning_rate": 3.66e-06, "loss": 0.0, "num_tokens": 232528.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_combined/mean": 1.0, "rewards/reward_combined/std": 0.0, "step": 733 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 13.592592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.4009535312652588, "kl": 0.054316360503435135, "learning_rate": 3.665e-06, "loss": 0.0027, "num_tokens": 232800.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 734 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.25, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 22.25, "completions/mean_terminated_length": 22.25, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 13.61111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.03054111637175083, "kl": 0.005433038575574756, "learning_rate": 3.6700000000000004e-06, "loss": 0.0003, "num_tokens": 233133.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 735 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 13.62962962962963, "frac_reward_zero_std": 0.0, "grad_norm": 7.104330539703369, "kl": 0.04929225705564022, "learning_rate": 3.6750000000000004e-06, "loss": 0.1247, "num_tokens": 233413.0, "reward": 5.875, "reward_std": 3.25, "rewards/reward_combined/mean": 5.875, "rewards/reward_combined/std": 3.25, "step": 736 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.25, "completions/clipped_ratio": 0.0, "completions/max_length": 37.0, "completions/max_terminated_length": 37.0, "completions/mean_length": 23.25, "completions/mean_terminated_length": 23.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 13.648148148148149, "frac_reward_zero_std": 0.0, "grad_norm": 3.8118488788604736, "kl": 0.13834291324019432, "learning_rate": 3.6800000000000003e-06, "loss": -0.0961, "num_tokens": 233746.0, "reward": 4.5, "reward_std": 4.041451930999756, "rewards/reward_combined/mean": 4.5, "rewards/reward_combined/std": 4.041451930999756, "step": 737 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.25, "completions/clipped_ratio": 0.0, "completions/max_length": 25.0, "completions/max_terminated_length": 25.0, "completions/mean_length": 24.25, "completions/mean_terminated_length": 24.25, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 13.666666666666666, "frac_reward_zero_std": 0.0, "grad_norm": 1.7733123302459717, "kl": 0.08374021574854851, "learning_rate": 3.6850000000000003e-06, "loss": 0.0145, "num_tokens": 234111.0, "reward": 2.875, "reward_std": 1.25, "rewards/reward_combined/mean": 2.875, "rewards/reward_combined/std": 1.25, "step": 738 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 13.685185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.03661537542939186, "kl": 0.0035909198923036456, "learning_rate": 3.6900000000000002e-06, "loss": 0.0002, "num_tokens": 234381.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 739 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 29.5, "completions/clipped_ratio": 0.0, "completions/max_length": 39.0, "completions/max_terminated_length": 39.0, "completions/mean_length": 29.5, "completions/mean_terminated_length": 29.5, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 13.703703703703704, "frac_reward_zero_std": 0.0, "grad_norm": 3.3390767574310303, "kl": 0.0393237229436636, "learning_rate": 3.695e-06, "loss": -0.0245, "num_tokens": 234727.0, "reward": 3.625, "reward_std": 2.8975563049316406, "rewards/reward_combined/mean": 3.625, "rewards/reward_combined/std": 2.8975565433502197, "step": 740 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 57.25, "completions/clipped_ratio": 0.0, "completions/max_length": 197.0, "completions/max_terminated_length": 197.0, "completions/mean_length": 57.25, "completions/mean_terminated_length": 57.25, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 13.722222222222221, "frac_reward_zero_std": 0.0, "grad_norm": 2.4769439697265625, "kl": 0.010741622652858496, "learning_rate": 3.7e-06, "loss": 0.4437, "num_tokens": 235176.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 741 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.25, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 16.25, "completions/mean_terminated_length": 16.25, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 13.74074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.01272677443921566, "kl": 0.01856026705354452, "learning_rate": 3.705e-06, "loss": 0.0009, "num_tokens": 235473.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 742 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.75, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.75, "completions/mean_terminated_length": 6.75, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 13.75925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.006303238216787577, "kl": 0.0029411599971354008, "learning_rate": 3.7100000000000005e-06, "loss": 0.0001, "num_tokens": 235708.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 743 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 13.777777777777779, "frac_reward_zero_std": 0.0, "grad_norm": 1.6445744037628174, "kl": 0.017406440572813153, "learning_rate": 3.7150000000000004e-06, "loss": -0.0245, "num_tokens": 235988.0, "reward": 5.125, "reward_std": 3.4731109142303467, "rewards/reward_combined/mean": 5.125, "rewards/reward_combined/std": 3.4731109142303467, "step": 744 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.5, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 19.5, "completions/mean_terminated_length": 19.5, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 13.796296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.02423044852912426, "kl": 0.010211076587438583, "learning_rate": 3.7200000000000004e-06, "loss": 0.0005, "num_tokens": 236294.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 745 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.75, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 6.75, "completions/mean_terminated_length": 6.75, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 13.814814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.31984129548072815, "kl": 0.037835922092199326, "learning_rate": 3.7250000000000003e-06, "loss": 0.0021, "num_tokens": 236521.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 746 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 86.75, "completions/clipped_ratio": 0.25, "completions/max_length": 256.0, "completions/max_terminated_length": 36.0, "completions/mean_length": 86.75, "completions/mean_terminated_length": 30.33333396911621, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 13.833333333333334, "frac_reward_zero_std": 0.0, "grad_norm": 1.5181691646575928, "kl": 0.04244566522538662, "learning_rate": 3.7300000000000003e-06, "loss": 0.4785, "num_tokens": 237104.0, "reward": 0.925000011920929, "reward_std": 1.8227726221084595, "rewards/reward_combined/mean": 0.925000011920929, "rewards/reward_combined/std": 1.8227726221084595, "step": 747 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 18.0, "completions/mean_terminated_length": 18.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 13.851851851851851, "frac_reward_zero_std": 1.0, "grad_norm": 0.14104068279266357, "kl": 0.03803800977766514, "learning_rate": 3.7350000000000002e-06, "loss": 0.0019, "num_tokens": 237404.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 748 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 13.87037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.00014288023521658033, "kl": 3.516674041748047e-06, "learning_rate": 3.74e-06, "loss": 0.0, "num_tokens": 237624.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 749 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 39.25, "completions/clipped_ratio": 0.0, "completions/max_length": 43.0, "completions/max_terminated_length": 43.0, "completions/mean_length": 39.25, "completions/mean_terminated_length": 39.25, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 13.88888888888889, "frac_reward_zero_std": 0.0, "grad_norm": 1.816287636756897, "kl": 0.013222066219896078, "learning_rate": 3.745e-06, "loss": -0.087, "num_tokens": 238061.0, "reward": 1.225000023841858, "reward_std": 1.1441882848739624, "rewards/reward_combined/mean": 1.225000023841858, "rewards/reward_combined/std": 1.1441882848739624, "step": 750 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.25, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 19.25, "completions/mean_terminated_length": 19.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 13.907407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.3327646553516388, "kl": 0.04497400484979153, "learning_rate": 3.7500000000000005e-06, "loss": 0.0023, "num_tokens": 238362.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 751 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 13.925925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.014154963195323944, "kl": 0.016040818765759468, "learning_rate": 3.7550000000000005e-06, "loss": 0.0008, "num_tokens": 238622.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 752 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.25, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 19.25, "completions/mean_terminated_length": 19.25, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 13.944444444444445, "frac_reward_zero_std": 1.0, "grad_norm": 0.14035962522029877, "kl": 0.055351260118186474, "learning_rate": 3.7600000000000004e-06, "loss": 0.0027, "num_tokens": 238919.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 753 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 13.962962962962964, "frac_reward_zero_std": 1.0, "grad_norm": 0.016483789309859276, "kl": 0.0009510591626167297, "learning_rate": 3.7650000000000004e-06, "loss": 0.0, "num_tokens": 239163.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 754 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 13.981481481481481, "frac_reward_zero_std": 1.0, "grad_norm": 0.2297651618719101, "kl": 0.003897041082382202, "learning_rate": 3.7700000000000003e-06, "loss": 0.0002, "num_tokens": 239375.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 755 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 14.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.012371162883937359, "kl": 0.0024543404579162598, "learning_rate": 3.7750000000000003e-06, "loss": 0.0001, "num_tokens": 239587.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 756 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.5, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 21.5, "completions/mean_terminated_length": 21.5, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 14.018518518518519, "frac_reward_zero_std": 0.0, "grad_norm": 4.31763219833374, "kl": 0.004710767883807421, "learning_rate": 3.7800000000000002e-06, "loss": 0.0257, "num_tokens": 239917.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 757 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.0, "completions/clipped_ratio": 0.0, "completions/max_length": 27.0, "completions/max_terminated_length": 27.0, "completions/mean_length": 21.0, "completions/mean_terminated_length": 21.0, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 14.037037037037036, "frac_reward_zero_std": 0.0, "grad_norm": 3.403818130493164, "kl": 0.09723297134041786, "learning_rate": 3.785e-06, "loss": -0.0852, "num_tokens": 240253.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 758 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 14.055555555555555, "frac_reward_zero_std": 1.0, "grad_norm": 0.012420766986906528, "kl": 0.009379841387271881, "learning_rate": 3.79e-06, "loss": 0.0005, "num_tokens": 240489.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 759 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.25, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 17.25, "completions/mean_terminated_length": 17.25, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 14.074074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.015251144766807556, "kl": 0.0011615600669756532, "learning_rate": 3.7950000000000005e-06, "loss": 0.0001, "num_tokens": 240798.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 760 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 14.092592592592593, "frac_reward_zero_std": 1.0, "grad_norm": 0.013446471653878689, "kl": 0.019816027022898197, "learning_rate": 3.8000000000000005e-06, "loss": 0.001, "num_tokens": 241090.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 761 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 14.11111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.14946724474430084, "kl": 0.0074563100934028625, "learning_rate": 3.8050000000000004e-06, "loss": 0.0004, "num_tokens": 241350.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 762 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 31.75, "completions/clipped_ratio": 0.0, "completions/max_length": 79.0, "completions/max_terminated_length": 79.0, "completions/mean_length": 31.75, "completions/mean_terminated_length": 31.75, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 14.12962962962963, "frac_reward_zero_std": 0.0, "grad_norm": 3.325711727142334, "kl": 0.02057038526982069, "learning_rate": 3.8100000000000004e-06, "loss": 0.3315, "num_tokens": 241725.0, "reward": 2.549999952316284, "reward_std": 1.899999976158142, "rewards/reward_combined/mean": 2.549999952316284, "rewards/reward_combined/std": 1.899999976158142, "step": 763 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 39.0, "completions/clipped_ratio": 0.0, "completions/max_length": 43.0, "completions/max_terminated_length": 43.0, "completions/mean_length": 39.0, "completions/mean_terminated_length": 39.0, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "epoch": 14.148148148148149, "frac_reward_zero_std": 0.0, "grad_norm": 1.9044928550720215, "kl": 0.04235299862921238, "learning_rate": 3.815000000000001e-06, "loss": 0.0091, "num_tokens": 242161.0, "reward": 1.1500000953674316, "reward_std": 1.7058721780776978, "rewards/reward_combined/mean": 1.1500000953674316, "rewards/reward_combined/std": 1.7058721780776978, "step": 764 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 36.5, "completions/clipped_ratio": 0.0, "completions/max_length": 52.0, "completions/max_terminated_length": 52.0, "completions/mean_length": 36.5, "completions/mean_terminated_length": 36.5, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 14.166666666666666, "frac_reward_zero_std": 0.0, "grad_norm": 5.841765403747559, "kl": 0.7092879600822926, "learning_rate": 3.820000000000001e-06, "loss": 0.235, "num_tokens": 242543.0, "reward": 0.125, "reward_std": 2.4958298206329346, "rewards/reward_combined/mean": 0.125, "rewards/reward_combined/std": 2.4958298206329346, "step": 765 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.0, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 21.0, "completions/mean_terminated_length": 21.0, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 14.185185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.07043575495481491, "kl": 0.012246244587004185, "learning_rate": 3.825000000000001e-06, "loss": 0.0006, "num_tokens": 242879.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 766 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 9.5, "completions/mean_terminated_length": 9.5, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 14.203703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.10480890423059464, "kl": 0.0028305358719080687, "learning_rate": 3.830000000000001e-06, "loss": 0.0001, "num_tokens": 243137.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 767 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.0, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 19.0, "completions/mean_terminated_length": 19.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 14.222222222222221, "frac_reward_zero_std": 1.0, "grad_norm": 0.08396655321121216, "kl": 0.04056290816515684, "learning_rate": 3.8350000000000006e-06, "loss": 0.002, "num_tokens": 243461.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 768 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 12.5, "completions/mean_terminated_length": 12.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 14.24074074074074, "frac_reward_zero_std": 0.0, "grad_norm": 7.680726528167725, "kl": 0.03983481228351593, "learning_rate": 3.8400000000000005e-06, "loss": 0.1082, "num_tokens": 243739.0, "reward": 4.5, "reward_std": 4.041451930999756, "rewards/reward_combined/mean": 4.5, "rewards/reward_combined/std": 4.041451930999756, "step": 769 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 14.25925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.01215905137360096, "kl": 0.0024683475494384766, "learning_rate": 3.8450000000000005e-06, "loss": 0.0001, "num_tokens": 243951.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 770 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.5, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.5, "completions/mean_terminated_length": 6.5, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 14.277777777777779, "frac_reward_zero_std": 1.0, "grad_norm": 0.07246628403663635, "kl": 0.004540033405646682, "learning_rate": 3.85e-06, "loss": 0.0002, "num_tokens": 244185.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 771 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 28.0, "completions/clipped_ratio": 0.0, "completions/max_length": 31.0, "completions/max_terminated_length": 31.0, "completions/mean_length": 28.0, "completions/mean_terminated_length": 28.0, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 14.296296296296296, "frac_reward_zero_std": 0.0, "grad_norm": 4.975471019744873, "kl": 0.057519715279340744, "learning_rate": 3.855e-06, "loss": 0.0297, "num_tokens": 244525.0, "reward": 1.75, "reward_std": 3.175426483154297, "rewards/reward_combined/mean": 1.75, "rewards/reward_combined/std": 3.175426483154297, "step": 772 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 14.314814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.0015441454015672207, "kl": 0.000135079026222229, "learning_rate": 3.86e-06, "loss": 0.0, "num_tokens": 244781.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 773 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.75, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 14.75, "completions/mean_terminated_length": 14.75, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 14.333333333333334, "frac_reward_zero_std": 0.0, "grad_norm": 2.5668203830718994, "kl": 0.0037408119533210993, "learning_rate": 3.865e-06, "loss": 0.0482, "num_tokens": 245064.0, "reward": 5.125, "reward_std": 3.4731109142303467, "rewards/reward_combined/mean": 5.125, "rewards/reward_combined/std": 3.4731109142303467, "step": 774 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.25, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 19.25, "completions/mean_terminated_length": 19.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 14.351851851851851, "frac_reward_zero_std": 0.0, "grad_norm": 5.479597091674805, "kl": 0.1465783715248108, "learning_rate": 3.87e-06, "loss": 0.1103, "num_tokens": 245381.0, "reward": 5.125, "reward_std": 3.4731109142303467, "rewards/reward_combined/mean": 5.125, "rewards/reward_combined/std": 3.4731109142303467, "step": 775 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 14.37037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.26076439023017883, "kl": 0.014173740521073341, "learning_rate": 3.875e-06, "loss": 0.0008, "num_tokens": 245645.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 776 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.5, "completions/clipped_ratio": 0.0, "completions/max_length": 35.0, "completions/max_terminated_length": 35.0, "completions/mean_length": 23.5, "completions/mean_terminated_length": 23.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 14.38888888888889, "frac_reward_zero_std": 0.0, "grad_norm": 7.3671159744262695, "kl": 0.03740457259118557, "learning_rate": 3.88e-06, "loss": 0.0086, "num_tokens": 245963.0, "reward": 5.375, "reward_std": 2.75, "rewards/reward_combined/mean": 5.375, "rewards/reward_combined/std": 2.75, "step": 777 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 14.407407407407407, "frac_reward_zero_std": 1.0, "grad_norm": 0.000147889819345437, "kl": 4.246830940246582e-06, "learning_rate": 3.885e-06, "loss": 0.0, "num_tokens": 246183.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 778 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.5, "completions/clipped_ratio": 0.0, "completions/max_length": 27.0, "completions/max_terminated_length": 27.0, "completions/mean_length": 17.5, "completions/mean_terminated_length": 17.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 14.425925925925926, "frac_reward_zero_std": 0.0, "grad_norm": 8.670975685119629, "kl": 0.05827266350388527, "learning_rate": 3.89e-06, "loss": 0.1872, "num_tokens": 246493.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 779 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 14.444444444444445, "frac_reward_zero_std": 1.0, "grad_norm": 0.08398685604333878, "kl": 0.021320071537047625, "learning_rate": 3.895000000000001e-06, "loss": 0.0011, "num_tokens": 246783.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 780 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 33.5, "completions/clipped_ratio": 0.0, "completions/max_length": 40.0, "completions/max_terminated_length": 40.0, "completions/mean_length": 33.5, "completions/mean_terminated_length": 33.5, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 14.462962962962964, "frac_reward_zero_std": 0.0, "grad_norm": 4.373116493225098, "kl": 0.04917445546016097, "learning_rate": 3.900000000000001e-06, "loss": 0.0441, "num_tokens": 247133.0, "reward": 2.125, "reward_std": 1.6007810831069946, "rewards/reward_combined/mean": 2.125, "rewards/reward_combined/std": 1.6007810831069946, "step": 781 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.0, "completions/clipped_ratio": 0.0, "completions/max_length": 30.0, "completions/max_terminated_length": 30.0, "completions/mean_length": 22.0, "completions/mean_terminated_length": 22.0, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 14.481481481481481, "frac_reward_zero_std": 0.0, "grad_norm": 10.557184219360352, "kl": 0.10524484142661095, "learning_rate": 3.905000000000001e-06, "loss": -0.0896, "num_tokens": 247441.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 782 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 10.5, "completions/mean_terminated_length": 10.5, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 14.5, "frac_reward_zero_std": 1.0, "grad_norm": 0.018286971375346184, "kl": 0.0029752476839348674, "learning_rate": 3.910000000000001e-06, "loss": 0.0002, "num_tokens": 247707.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 783 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.5, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 24.5, "completions/mean_terminated_length": 24.5, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 14.518518518518519, "frac_reward_zero_std": 1.0, "grad_norm": 0.07146304845809937, "kl": 0.0903204008936882, "learning_rate": 3.915000000000001e-06, "loss": 0.0045, "num_tokens": 248073.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 784 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 14.537037037037036, "frac_reward_zero_std": 1.0, "grad_norm": 0.03269721940159798, "kl": 0.003276268020272255, "learning_rate": 3.920000000000001e-06, "loss": 0.0002, "num_tokens": 248391.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 785 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.75, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 16.75, "completions/mean_terminated_length": 16.75, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 14.555555555555555, "frac_reward_zero_std": 1.0, "grad_norm": 0.06375214457511902, "kl": 0.004093641910003498, "learning_rate": 3.9250000000000005e-06, "loss": 0.0002, "num_tokens": 248698.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_combined/mean": 1.0, "rewards/reward_combined/std": 0.0, "step": 786 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.75, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 15.75, "completions/mean_terminated_length": 15.75, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 14.574074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.5278058052062988, "kl": 0.05477179028093815, "learning_rate": 3.9300000000000005e-06, "loss": 0.0026, "num_tokens": 248985.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 787 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.008771929889917374, "clip_ratio/low_min": 0.008771929889917374, "clip_ratio/region_mean": 0.008771929889917374, "completion_length": 26.0, "completions/clipped_ratio": 0.0, "completions/max_length": 34.0, "completions/max_terminated_length": 34.0, "completions/mean_length": 26.0, "completions/mean_terminated_length": 26.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 14.592592592592592, "frac_reward_zero_std": 0.0, "grad_norm": 2.680300712585449, "kl": 0.032285021618008614, "learning_rate": 3.9350000000000004e-06, "loss": -0.158, "num_tokens": 249317.0, "reward": 4.625, "reward_std": 2.25, "rewards/reward_combined/mean": 4.625, "rewards/reward_combined/std": 2.25, "step": 788 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 5.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 5.0, "completions/mean_terminated_length": 5.0, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 14.61111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.007741305511444807, "kl": 0.0020009339787065983, "learning_rate": 3.94e-06, "loss": 0.0001, "num_tokens": 249537.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 789 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.0, "completions/clipped_ratio": 0.0, "completions/max_length": 27.0, "completions/max_terminated_length": 27.0, "completions/mean_length": 21.0, "completions/mean_terminated_length": 21.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 14.62962962962963, "frac_reward_zero_std": 0.0, "grad_norm": 6.441678524017334, "kl": 0.04412580654025078, "learning_rate": 3.945e-06, "loss": 0.0793, "num_tokens": 249841.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 790 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.0, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 24.0, "completions/mean_terminated_length": 24.0, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 14.648148148148149, "frac_reward_zero_std": 0.0, "grad_norm": 3.5013630390167236, "kl": 0.010462728329002857, "learning_rate": 3.95e-06, "loss": 0.0386, "num_tokens": 250165.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 791 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 49.75, "completions/clipped_ratio": 0.0, "completions/max_length": 109.0, "completions/max_terminated_length": 109.0, "completions/mean_length": 49.75, "completions/mean_terminated_length": 49.75, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 14.666666666666666, "frac_reward_zero_std": 0.0, "grad_norm": 3.520061731338501, "kl": 0.056867774575948715, "learning_rate": 3.955e-06, "loss": 0.299, "num_tokens": 250588.0, "reward": 5.25, "reward_std": 4.5, "rewards/reward_combined/mean": 5.25, "rewards/reward_combined/std": 4.5, "step": 792 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.75, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 4.75, "completions/mean_terminated_length": 4.75, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 14.685185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.6459870934486389, "kl": 0.05611857399344444, "learning_rate": 3.96e-06, "loss": 0.0032, "num_tokens": 250807.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 793 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 14.703703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.016079023480415344, "kl": 0.001964849012438208, "learning_rate": 3.965e-06, "loss": 0.0001, "num_tokens": 251095.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 794 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 14.722222222222221, "frac_reward_zero_std": 1.0, "grad_norm": 0.08126889169216156, "kl": 0.02591540291905403, "learning_rate": 3.97e-06, "loss": 0.0013, "num_tokens": 251355.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 795 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 31.25, "completions/clipped_ratio": 0.0, "completions/max_length": 59.0, "completions/max_terminated_length": 59.0, "completions/mean_length": 31.25, "completions/mean_terminated_length": 31.25, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 14.74074074074074, "frac_reward_zero_std": 0.0, "grad_norm": 2.852466344833374, "kl": 0.024628701619803905, "learning_rate": 3.975000000000001e-06, "loss": 0.0781, "num_tokens": 251700.0, "reward": 2.25, "reward_std": 3.5, "rewards/reward_combined/mean": 2.25, "rewards/reward_combined/std": 3.5, "step": 796 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.75, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 12.75, "completions/mean_terminated_length": 12.75, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 14.75925925925926, "frac_reward_zero_std": 0.0, "grad_norm": 4.817673683166504, "kl": 0.06099597131833434, "learning_rate": 3.980000000000001e-06, "loss": -0.0144, "num_tokens": 251983.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 797 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 14.777777777777779, "frac_reward_zero_std": 1.0, "grad_norm": 0.012931613251566887, "kl": 0.018622832372784615, "learning_rate": 3.985000000000001e-06, "loss": 0.0009, "num_tokens": 252255.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 798 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.25, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 9.25, "completions/mean_terminated_length": 9.25, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 14.796296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.2518782317638397, "kl": 0.02479848451912403, "learning_rate": 3.990000000000001e-06, "loss": 0.0013, "num_tokens": 252516.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 799 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 14.814814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.06475525349378586, "kl": 0.0064600761397741735, "learning_rate": 3.995000000000001e-06, "loss": 0.0004, "num_tokens": 252816.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 800 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.25, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 9.25, "completions/mean_terminated_length": 9.25, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 14.833333333333334, "frac_reward_zero_std": 1.0, "grad_norm": 0.3029751479625702, "kl": 0.01974543184041977, "learning_rate": 4.000000000000001e-06, "loss": 0.0011, "num_tokens": 253065.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 801 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 75.0, "completions/clipped_ratio": 0.25, "completions/max_length": 256.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 75.0, "completions/mean_terminated_length": 14.666666984558105, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 14.851851851851851, "frac_reward_zero_std": 0.0, "grad_norm": 1.7521353960037231, "kl": 0.005116186337545514, "learning_rate": 4.005000000000001e-06, "loss": 0.2891, "num_tokens": 253585.0, "reward": 0.25, "reward_std": 2.872281312942505, "rewards/reward_combined/mean": 0.25, "rewards/reward_combined/std": 2.872281312942505, "step": 802 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.5, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 19.5, "completions/mean_terminated_length": 19.5, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 14.87037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.025308286771178246, "kl": 0.009261089842766523, "learning_rate": 4.0100000000000006e-06, "loss": 0.0005, "num_tokens": 253891.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 803 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 14.88888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.2860097885131836, "kl": 0.04894896596670151, "learning_rate": 4.0150000000000005e-06, "loss": 0.0024, "num_tokens": 254155.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 804 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.0, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 21.0, "completions/mean_terminated_length": 21.0, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 14.907407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.042437098920345306, "kl": 0.027062603272497654, "learning_rate": 4.0200000000000005e-06, "loss": 0.0014, "num_tokens": 254487.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 805 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 14.925925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.18794283270835876, "kl": 0.004791490733623505, "learning_rate": 4.0250000000000004e-06, "loss": 0.0002, "num_tokens": 254699.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 806 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 14.944444444444445, "frac_reward_zero_std": 0.0, "grad_norm": 5.970949649810791, "kl": 0.025737378746271133, "learning_rate": 4.03e-06, "loss": 0.1626, "num_tokens": 254977.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 807 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 14.962962962962964, "frac_reward_zero_std": 1.0, "grad_norm": 0.046903565526008606, "kl": 0.0009288634173572063, "learning_rate": 4.035e-06, "loss": 0.0, "num_tokens": 255247.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 808 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 14.981481481481481, "frac_reward_zero_std": 1.0, "grad_norm": 0.07680512219667435, "kl": 0.00451252399943769, "learning_rate": 4.04e-06, "loss": 0.0002, "num_tokens": 255523.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 809 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 15.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.16044236719608307, "kl": 0.013277128338813782, "learning_rate": 4.045e-06, "loss": 0.0007, "num_tokens": 255727.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 810 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 15.018518518518519, "frac_reward_zero_std": 1.0, "grad_norm": 0.03446204215288162, "kl": 0.0012647882103919983, "learning_rate": 4.05e-06, "loss": 0.0001, "num_tokens": 255987.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 811 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 15.037037037037036, "frac_reward_zero_std": 1.0, "grad_norm": 0.0913456603884697, "kl": 0.02752254717051983, "learning_rate": 4.055000000000001e-06, "loss": 0.0014, "num_tokens": 256299.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 812 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.25, "completions/clipped_ratio": 0.0, "completions/max_length": 27.0, "completions/max_terminated_length": 27.0, "completions/mean_length": 24.25, "completions/mean_terminated_length": 24.25, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 15.055555555555555, "frac_reward_zero_std": 1.0, "grad_norm": 0.2127007395029068, "kl": 0.03157759318128228, "learning_rate": 4.060000000000001e-06, "loss": 0.0016, "num_tokens": 256624.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 813 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.25, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 9.25, "completions/mean_terminated_length": 9.25, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 15.074074074074074, "frac_reward_zero_std": 0.0, "grad_norm": 4.106165409088135, "kl": 0.1654473477974534, "learning_rate": 4.065e-06, "loss": 0.035, "num_tokens": 256885.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 814 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.5, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 19.5, "completions/mean_terminated_length": 19.5, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 15.092592592592593, "frac_reward_zero_std": 1.0, "grad_norm": 0.13106311857700348, "kl": 0.0363545548170805, "learning_rate": 4.07e-06, "loss": 0.0018, "num_tokens": 257187.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 815 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.25, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 17.25, "completions/mean_terminated_length": 17.25, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 15.11111111111111, "frac_reward_zero_std": 0.0, "grad_norm": 6.2002482414245605, "kl": 0.010145367588847876, "learning_rate": 4.075e-06, "loss": 0.1196, "num_tokens": 257516.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 816 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 15.12962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.131065234541893, "kl": 0.007729876087978482, "learning_rate": 4.08e-06, "loss": 0.0004, "num_tokens": 257776.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 817 }, { "clip_ratio/high_max": 0.0060240961611270905, "clip_ratio/high_mean": 0.0060240961611270905, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0060240961611270905, "completion_length": 35.0, "completions/clipped_ratio": 0.0, "completions/max_length": 48.0, "completions/max_terminated_length": 48.0, "completions/mean_length": 35.0, "completions/mean_terminated_length": 35.0, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 15.148148148148149, "frac_reward_zero_std": 0.0, "grad_norm": 3.633340835571289, "kl": 0.04100535297766328, "learning_rate": 4.085e-06, "loss": -0.0039, "num_tokens": 258132.0, "reward": 2.875, "reward_std": 1.25, "rewards/reward_combined/mean": 2.875, "rewards/reward_combined/std": 1.25, "step": 818 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 15.166666666666666, "frac_reward_zero_std": 1.0, "grad_norm": 0.009943247772753239, "kl": 0.0029736459255218506, "learning_rate": 4.09e-06, "loss": 0.0001, "num_tokens": 258344.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 819 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 5.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 5.0, "completions/mean_terminated_length": 5.0, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 15.185185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.00609003659337759, "kl": 0.002708655665628612, "learning_rate": 4.095e-06, "loss": 0.0001, "num_tokens": 258564.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 820 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 15.203703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.24810001254081726, "kl": 0.040893979370594025, "learning_rate": 4.1e-06, "loss": 0.002, "num_tokens": 258780.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 821 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.75, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 23.75, "completions/mean_terminated_length": 23.75, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 15.222222222222221, "frac_reward_zero_std": 0.0, "grad_norm": 1.0826128721237183, "kl": 0.12609761208295822, "learning_rate": 4.1050000000000005e-06, "loss": -0.0029, "num_tokens": 259143.0, "reward": 2.875, "reward_std": 1.25, "rewards/reward_combined/mean": 2.875, "rewards/reward_combined/std": 1.25, "step": 822 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 46.75, "completions/clipped_ratio": 0.0, "completions/max_length": 61.0, "completions/max_terminated_length": 61.0, "completions/mean_length": 46.75, "completions/mean_terminated_length": 46.75, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "epoch": 15.24074074074074, "frac_reward_zero_std": 0.0, "grad_norm": 2.1853458881378174, "kl": 0.04248490184545517, "learning_rate": 4.1100000000000005e-06, "loss": -0.016, "num_tokens": 259610.0, "reward": 1.350000023841858, "reward_std": 1.5154757499694824, "rewards/reward_combined/mean": 1.350000023841858, "rewards/reward_combined/std": 1.5154757499694824, "step": 823 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 15.25925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.008526991121470928, "kl": 0.009859099984169006, "learning_rate": 4.115e-06, "loss": 0.0005, "num_tokens": 259846.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 824 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 15.277777777777779, "frac_reward_zero_std": 1.0, "grad_norm": 0.07569072395563126, "kl": 0.007421077461913228, "learning_rate": 4.12e-06, "loss": 0.0004, "num_tokens": 260146.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 825 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.5, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 15.296296296296296, "frac_reward_zero_std": 0.0, "grad_norm": 4.188582897186279, "kl": 0.05018555000424385, "learning_rate": 4.125e-06, "loss": -0.1072, "num_tokens": 260424.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 826 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 15.314814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.0002369281864957884, "kl": 1.5504658222198486e-05, "learning_rate": 4.13e-06, "loss": 0.0, "num_tokens": 260644.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 827 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.75, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 19.75, "completions/mean_terminated_length": 19.75, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 15.333333333333334, "frac_reward_zero_std": 0.0, "grad_norm": 5.14414119720459, "kl": 0.05915108695626259, "learning_rate": 4.135e-06, "loss": 0.0275, "num_tokens": 260943.0, "reward": 4.625, "reward_std": 3.4731109142303467, "rewards/reward_combined/mean": 4.625, "rewards/reward_combined/std": 3.4731109142303467, "step": 828 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 15.351851851851851, "frac_reward_zero_std": 1.0, "grad_norm": 0.03864609822630882, "kl": 0.028958570212125778, "learning_rate": 4.14e-06, "loss": 0.0015, "num_tokens": 261237.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 829 }, { "clip_ratio/high_max": 0.007692307699471712, "clip_ratio/high_mean": 0.007692307699471712, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.007692307699471712, "completion_length": 30.0, "completions/clipped_ratio": 0.0, "completions/max_length": 35.0, "completions/max_terminated_length": 35.0, "completions/mean_length": 30.0, "completions/mean_terminated_length": 30.0, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 15.37037037037037, "frac_reward_zero_std": 0.0, "grad_norm": 5.989077568054199, "kl": 0.10456137731671333, "learning_rate": 4.145e-06, "loss": -0.0351, "num_tokens": 261581.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 830 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.25, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 21.25, "completions/mean_terminated_length": 21.25, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 15.38888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.5187484622001648, "kl": 0.04340506950393319, "learning_rate": 4.15e-06, "loss": 0.0022, "num_tokens": 261910.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 831 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 7.75, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 7.75, "completions/mean_terminated_length": 7.75, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 15.407407407407407, "frac_reward_zero_std": 1.0, "grad_norm": 0.908170759677887, "kl": 0.06906316801905632, "learning_rate": 4.155e-06, "loss": 0.0032, "num_tokens": 262153.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 832 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 15.425925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.008886203169822693, "kl": 0.0015581553452648222, "learning_rate": 4.16e-06, "loss": 0.0001, "num_tokens": 262435.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 833 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 15.444444444444445, "frac_reward_zero_std": 1.0, "grad_norm": 0.1633414924144745, "kl": 0.026020605117082596, "learning_rate": 4.165e-06, "loss": 0.0013, "num_tokens": 262707.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 834 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.5, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 20.5, "completions/mean_terminated_length": 20.5, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 15.462962962962964, "frac_reward_zero_std": 0.0, "grad_norm": 5.025596618652344, "kl": 0.07242384925484657, "learning_rate": 4.17e-06, "loss": -0.012, "num_tokens": 263009.0, "reward": 2.25, "reward_std": 3.5, "rewards/reward_combined/mean": 2.25, "rewards/reward_combined/std": 3.5, "step": 835 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.25, "completions/clipped_ratio": 0.0, "completions/max_length": 25.0, "completions/max_terminated_length": 25.0, "completions/mean_length": 21.25, "completions/mean_terminated_length": 21.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 15.481481481481481, "frac_reward_zero_std": 0.0, "grad_norm": 2.3780198097229004, "kl": 0.046786027029156685, "learning_rate": 4.175e-06, "loss": 0.0128, "num_tokens": 263342.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 836 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 15.5, "frac_reward_zero_std": 1.0, "grad_norm": 0.05383719503879547, "kl": 0.008589566685259342, "learning_rate": 4.18e-06, "loss": 0.0004, "num_tokens": 263624.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 837 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 15.518518518518519, "frac_reward_zero_std": 1.0, "grad_norm": 0.03707544878125191, "kl": 0.01934585440903902, "learning_rate": 4.185000000000001e-06, "loss": 0.001, "num_tokens": 263896.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 838 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.25, "completions/clipped_ratio": 0.0, "completions/max_length": 32.0, "completions/max_terminated_length": 32.0, "completions/mean_length": 23.25, "completions/mean_terminated_length": 23.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 15.537037037037036, "frac_reward_zero_std": 0.0, "grad_norm": 4.9121317863464355, "kl": 0.1459026150405407, "learning_rate": 4.1900000000000005e-06, "loss": 0.0322, "num_tokens": 264217.0, "reward": 4.0, "reward_std": 4.041451930999756, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 4.041451930999756, "step": 839 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 15.555555555555555, "frac_reward_zero_std": 1.0, "grad_norm": 0.018496012315154076, "kl": 0.001965395174920559, "learning_rate": 4.1950000000000005e-06, "loss": 0.0001, "num_tokens": 264497.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 840 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 85.75, "completions/clipped_ratio": 0.25, "completions/max_length": 256.0, "completions/max_terminated_length": 36.0, "completions/mean_length": 85.75, "completions/mean_terminated_length": 29.0, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 15.574074074074074, "frac_reward_zero_std": 0.0, "grad_norm": 2.4332661628723145, "kl": 0.0651512686163187, "learning_rate": 4.2000000000000004e-06, "loss": 0.4593, "num_tokens": 265092.0, "reward": 3.5, "reward_std": 4.690415859222412, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 4.690415859222412, "step": 841 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.0, "completions/clipped_ratio": 0.0, "completions/max_length": 32.0, "completions/max_terminated_length": 32.0, "completions/mean_length": 21.0, "completions/mean_terminated_length": 21.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 15.592592592592592, "frac_reward_zero_std": 0.0, "grad_norm": 6.422827243804932, "kl": 0.14921009168028831, "learning_rate": 4.205e-06, "loss": 0.1605, "num_tokens": 265416.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 842 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.25, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 9.25, "completions/mean_terminated_length": 9.25, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 15.61111111111111, "frac_reward_zero_std": 0.0, "grad_norm": 9.314942359924316, "kl": 0.045185595750808716, "learning_rate": 4.21e-06, "loss": 0.2747, "num_tokens": 265673.0, "reward": 2.75, "reward_std": 1.5, "rewards/reward_combined/mean": 2.75, "rewards/reward_combined/std": 1.5, "step": 843 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 15.62962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.003748895600438118, "kl": 0.00021582841145573184, "learning_rate": 4.215e-06, "loss": 0.0, "num_tokens": 265929.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 844 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.25, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.25, "completions/mean_terminated_length": 14.25, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 15.648148148148149, "frac_reward_zero_std": 1.0, "grad_norm": 0.12394071370363235, "kl": 0.010921970591880381, "learning_rate": 4.22e-06, "loss": 0.0005, "num_tokens": 266218.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 845 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 15.666666666666666, "frac_reward_zero_std": 1.0, "grad_norm": 0.06777188181877136, "kl": 0.00659808237105608, "learning_rate": 4.225e-06, "loss": 0.0003, "num_tokens": 266520.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 846 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.25, "completions/clipped_ratio": 0.0, "completions/max_length": 25.0, "completions/max_terminated_length": 25.0, "completions/mean_length": 21.25, "completions/mean_terminated_length": 21.25, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 15.685185185185185, "frac_reward_zero_std": 0.0, "grad_norm": 6.732229709625244, "kl": 0.04574473015964031, "learning_rate": 4.23e-06, "loss": -0.0719, "num_tokens": 266825.0, "reward": 1.75, "reward_std": 1.5, "rewards/reward_combined/mean": 1.75, "rewards/reward_combined/std": 1.5, "step": 847 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 33.5, "completions/clipped_ratio": 0.0, "completions/max_length": 44.0, "completions/max_terminated_length": 44.0, "completions/mean_length": 33.5, "completions/mean_terminated_length": 33.5, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 15.703703703703704, "frac_reward_zero_std": 0.0, "grad_norm": 4.1197123527526855, "kl": 0.046001091599464417, "learning_rate": 4.235e-06, "loss": 0.0972, "num_tokens": 267195.0, "reward": 1.75, "reward_std": 1.443375587463379, "rewards/reward_combined/mean": 1.75, "rewards/reward_combined/std": 1.4433757066726685, "step": 848 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 15.722222222222221, "frac_reward_zero_std": 1.0, "grad_norm": 0.010410189628601074, "kl": 0.027666288428008556, "learning_rate": 4.24e-06, "loss": 0.0014, "num_tokens": 267463.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 849 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 15.74074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.1279684603214264, "kl": 0.0033725574612617493, "learning_rate": 4.245e-06, "loss": 0.0002, "num_tokens": 267675.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 850 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 7.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 7.0, "completions/mean_terminated_length": 7.0, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 15.75925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.040572792291641235, "kl": 0.005237758159637451, "learning_rate": 4.25e-06, "loss": 0.0003, "num_tokens": 267911.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 851 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 15.777777777777779, "frac_reward_zero_std": 1.0, "grad_norm": 0.032363731414079666, "kl": 0.0011138220434077084, "learning_rate": 4.255e-06, "loss": 0.0001, "num_tokens": 268171.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 852 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.75, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 17.75, "completions/mean_terminated_length": 17.75, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 15.796296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.03603840991854668, "kl": 0.00612452020868659, "learning_rate": 4.26e-06, "loss": 0.0003, "num_tokens": 268470.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 853 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 31.5, "completions/clipped_ratio": 0.0, "completions/max_length": 70.0, "completions/max_terminated_length": 70.0, "completions/mean_length": 31.5, "completions/mean_terminated_length": 31.5, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 15.814814814814815, "frac_reward_zero_std": 0.0, "grad_norm": 3.659008026123047, "kl": 0.006250419013667852, "learning_rate": 4.265000000000001e-06, "loss": 0.2976, "num_tokens": 268836.0, "reward": 5.75, "reward_std": 4.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 4.5, "step": 854 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.25, "completions/clipped_ratio": 0.0, "completions/max_length": 27.0, "completions/max_terminated_length": 27.0, "completions/mean_length": 23.25, "completions/mean_terminated_length": 23.25, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 15.833333333333334, "frac_reward_zero_std": 0.0, "grad_norm": 5.331209659576416, "kl": 0.08920078724622726, "learning_rate": 4.270000000000001e-06, "loss": 0.0068, "num_tokens": 269177.0, "reward": 2.5, "reward_std": 1.7320507764816284, "rewards/reward_combined/mean": 2.5, "rewards/reward_combined/std": 1.7320507764816284, "step": 855 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.25, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 20.25, "completions/mean_terminated_length": 20.25, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 15.851851851851851, "frac_reward_zero_std": 0.0, "grad_norm": 3.1684763431549072, "kl": 0.08064413443207741, "learning_rate": 4.2750000000000006e-06, "loss": -0.0299, "num_tokens": 269486.0, "reward": 4.625, "reward_std": 2.25, "rewards/reward_combined/mean": 4.625, "rewards/reward_combined/std": 2.25, "step": 856 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.0, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 20.0, "completions/mean_terminated_length": 20.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 15.87037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.24997283518314362, "kl": 0.04667002893984318, "learning_rate": 4.2800000000000005e-06, "loss": 0.0023, "num_tokens": 269818.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 857 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.75, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.75, "completions/mean_terminated_length": 14.75, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 15.88888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 1.3770509958267212, "kl": 0.20949265826493502, "learning_rate": 4.2850000000000005e-06, "loss": 0.0101, "num_tokens": 270109.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 858 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 10.5, "completions/mean_terminated_length": 10.5, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 15.907407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.07958947122097015, "kl": 0.0038124948041513562, "learning_rate": 4.2900000000000004e-06, "loss": 0.0002, "num_tokens": 270371.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 859 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 15.925925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.022981787100434303, "kl": 0.007213897537440062, "learning_rate": 4.295e-06, "loss": 0.0004, "num_tokens": 270655.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 860 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 15.944444444444445, "frac_reward_zero_std": 1.0, "grad_norm": 0.09707469493150711, "kl": 0.0023806244134902954, "learning_rate": 4.3e-06, "loss": 0.0001, "num_tokens": 270863.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 861 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 15.962962962962964, "frac_reward_zero_std": 1.0, "grad_norm": 0.02495725452899933, "kl": 0.0006199826602824032, "learning_rate": 4.305e-06, "loss": 0.0, "num_tokens": 271133.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 862 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.25, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 11.25, "completions/mean_terminated_length": 11.25, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 15.981481481481481, "frac_reward_zero_std": 1.0, "grad_norm": 0.3438933193683624, "kl": 0.05370526388287544, "learning_rate": 4.31e-06, "loss": 0.0029, "num_tokens": 271398.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 863 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.75, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 16.75, "completions/mean_terminated_length": 16.75, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 16.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.005721221677958965, "kl": 0.00033278313640039414, "learning_rate": 4.315e-06, "loss": 0.0, "num_tokens": 271705.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_combined/mean": 1.0, "rewards/reward_combined/std": 0.0, "step": 864 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.25, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.25, "completions/mean_terminated_length": 14.25, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 16.01851851851852, "frac_reward_zero_std": 0.0, "grad_norm": 4.772317409515381, "kl": 0.007601015968248248, "learning_rate": 4.32e-06, "loss": -0.0188, "num_tokens": 271986.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 865 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.25, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 15.25, "completions/mean_terminated_length": 15.25, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 16.037037037037038, "frac_reward_zero_std": 0.0, "grad_norm": 4.076395034790039, "kl": 0.009831957519054413, "learning_rate": 4.325e-06, "loss": 0.0349, "num_tokens": 272275.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 866 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 16.055555555555557, "frac_reward_zero_std": 1.0, "grad_norm": 0.006879337597638369, "kl": 0.010098949074745178, "learning_rate": 4.33e-06, "loss": 0.0005, "num_tokens": 272511.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 867 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 16.074074074074073, "frac_reward_zero_std": 1.0, "grad_norm": 0.06993933767080307, "kl": 0.04259549267590046, "learning_rate": 4.335e-06, "loss": 0.0021, "num_tokens": 272807.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 868 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 7.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 7.0, "completions/mean_terminated_length": 7.0, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 16.09259259259259, "frac_reward_zero_std": 1.0, "grad_norm": 0.040371522307395935, "kl": 0.004392934730276465, "learning_rate": 4.34e-06, "loss": 0.0002, "num_tokens": 273043.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 869 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.75, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.75, "completions/mean_terminated_length": 13.75, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 16.11111111111111, "frac_reward_zero_std": 0.0, "grad_norm": 4.289167881011963, "kl": 0.08902685157954693, "learning_rate": 4.345000000000001e-06, "loss": -0.021, "num_tokens": 273330.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 870 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 16.12962962962963, "frac_reward_zero_std": 0.0, "grad_norm": 6.370597839355469, "kl": 0.005719722015783191, "learning_rate": 4.350000000000001e-06, "loss": 0.0405, "num_tokens": 273602.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 871 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.75, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 9.75, "completions/mean_terminated_length": 9.75, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 16.14814814814815, "frac_reward_zero_std": 0.0, "grad_norm": 11.703963279724121, "kl": 0.014209914952516556, "learning_rate": 4.355000000000001e-06, "loss": -0.0732, "num_tokens": 273853.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 872 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 18.0, "completions/mean_terminated_length": 18.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 16.166666666666668, "frac_reward_zero_std": 1.0, "grad_norm": 0.009109634906053543, "kl": 0.0004607571318047121, "learning_rate": 4.360000000000001e-06, "loss": 0.0, "num_tokens": 274165.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 873 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.75, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 13.75, "completions/mean_terminated_length": 13.75, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 16.185185185185187, "frac_reward_zero_std": 1.0, "grad_norm": 0.059508174657821655, "kl": 0.0060621666489169, "learning_rate": 4.3650000000000006e-06, "loss": 0.0003, "num_tokens": 274444.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 874 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.0, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 17.0, "completions/mean_terminated_length": 17.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 16.203703703703702, "frac_reward_zero_std": 0.0, "grad_norm": 6.767501354217529, "kl": 0.21750788390636444, "learning_rate": 4.3700000000000005e-06, "loss": -0.0149, "num_tokens": 274752.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 875 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.25, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 18.25, "completions/mean_terminated_length": 18.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 16.22222222222222, "frac_reward_zero_std": 1.0, "grad_norm": 0.11726614087820053, "kl": 0.01764029450714588, "learning_rate": 4.3750000000000005e-06, "loss": 0.0009, "num_tokens": 275077.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 876 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 16.24074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.0002382801176281646, "kl": 1.9341707229614258e-05, "learning_rate": 4.38e-06, "loss": 0.0, "num_tokens": 275297.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 877 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.5, "completions/clipped_ratio": 0.0, "completions/max_length": 28.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 25.5, "completions/mean_terminated_length": 25.5, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 16.25925925925926, "frac_reward_zero_std": 0.0, "grad_norm": 3.324201822280884, "kl": 0.11101634800434113, "learning_rate": 4.385e-06, "loss": 0.0614, "num_tokens": 275635.0, "reward": 2.875, "reward_std": 3.3008837699890137, "rewards/reward_combined/mean": 2.875, "rewards/reward_combined/std": 3.3008837699890137, "step": 878 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 27.75, "completions/clipped_ratio": 0.0, "completions/max_length": 52.0, "completions/max_terminated_length": 52.0, "completions/mean_length": 27.75, "completions/mean_terminated_length": 27.75, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 16.27777777777778, "frac_reward_zero_std": 0.0, "grad_norm": 6.116613388061523, "kl": 0.42608632147312164, "learning_rate": 4.39e-06, "loss": 0.2841, "num_tokens": 275998.0, "reward": 4.875, "reward_std": 3.5443618297576904, "rewards/reward_combined/mean": 4.875, "rewards/reward_combined/std": 3.5443618297576904, "step": 879 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 7.25, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 7.25, "completions/mean_terminated_length": 7.25, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 16.296296296296298, "frac_reward_zero_std": 1.0, "grad_norm": 0.28974246978759766, "kl": 0.03467692621052265, "learning_rate": 4.395e-06, "loss": 0.0019, "num_tokens": 276227.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 880 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 5.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 5.0, "completions/mean_terminated_length": 5.0, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 16.314814814814813, "frac_reward_zero_std": 1.0, "grad_norm": 0.0068334732204675674, "kl": 0.002387493848800659, "learning_rate": 4.4e-06, "loss": 0.0001, "num_tokens": 276447.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 881 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 16.333333333333332, "frac_reward_zero_std": 1.0, "grad_norm": 0.018854912370443344, "kl": 0.017495368607342243, "learning_rate": 4.405e-06, "loss": 0.0009, "num_tokens": 276719.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 882 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 16.35185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.1907254457473755, "kl": 0.007228721398860216, "learning_rate": 4.41e-06, "loss": 0.0003, "num_tokens": 276979.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 883 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.75, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 16.75, "completions/mean_terminated_length": 16.75, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 16.37037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.026545504108071327, "kl": 0.0012042553425999358, "learning_rate": 4.415e-06, "loss": 0.0001, "num_tokens": 277286.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_combined/mean": 1.0, "rewards/reward_combined/std": 0.0, "step": 884 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.5, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 19.5, "completions/mean_terminated_length": 19.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 16.38888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.460348516702652, "kl": 0.09464705362915993, "learning_rate": 4.42e-06, "loss": 0.0049, "num_tokens": 277584.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 885 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 16.40740740740741, "frac_reward_zero_std": 1.0, "grad_norm": 0.01875476911664009, "kl": 0.00021392107009887695, "learning_rate": 4.425e-06, "loss": 0.0, "num_tokens": 277796.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 886 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 17.0, "completions/mean_terminated_length": 17.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 16.425925925925927, "frac_reward_zero_std": 1.0, "grad_norm": 0.08975128084421158, "kl": 0.010617698077112436, "learning_rate": 4.430000000000001e-06, "loss": 0.0005, "num_tokens": 278104.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 887 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 36.75, "completions/clipped_ratio": 0.0, "completions/max_length": 47.0, "completions/max_terminated_length": 47.0, "completions/mean_length": 36.75, "completions/mean_terminated_length": 36.75, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 16.444444444444443, "frac_reward_zero_std": 0.0, "grad_norm": 5.179082870483398, "kl": 0.08234044909477234, "learning_rate": 4.435000000000001e-06, "loss": 0.1755, "num_tokens": 278475.0, "reward": 4.0, "reward_std": 4.041451930999756, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 4.041451930999756, "step": 888 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.0, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 19.0, "completions/mean_terminated_length": 19.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 16.462962962962962, "frac_reward_zero_std": 1.0, "grad_norm": 0.16697172820568085, "kl": 0.047386908903717995, "learning_rate": 4.440000000000001e-06, "loss": 0.0024, "num_tokens": 278771.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 889 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 16.48148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.01870463788509369, "kl": 0.0016571240266785026, "learning_rate": 4.445000000000001e-06, "loss": 0.0001, "num_tokens": 279051.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 890 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.0, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 20.0, "completions/mean_terminated_length": 20.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 16.5, "frac_reward_zero_std": 1.0, "grad_norm": 0.16704799234867096, "kl": 0.02155154338106513, "learning_rate": 4.450000000000001e-06, "loss": 0.001, "num_tokens": 279375.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 891 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 16.51851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.010847998782992363, "kl": 0.00239679217338562, "learning_rate": 4.4550000000000005e-06, "loss": 0.0001, "num_tokens": 279587.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 892 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 16.537037037037038, "frac_reward_zero_std": 1.0, "grad_norm": 0.022294873371720314, "kl": 0.006411749869585037, "learning_rate": 4.4600000000000005e-06, "loss": 0.0003, "num_tokens": 279871.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 893 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.25, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 10.25, "completions/mean_terminated_length": 10.25, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 16.555555555555557, "frac_reward_zero_std": 1.0, "grad_norm": 0.13712498545646667, "kl": 0.005332246597390622, "learning_rate": 4.4650000000000004e-06, "loss": 0.0003, "num_tokens": 280128.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 894 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.25, "completions/clipped_ratio": 0.0, "completions/max_length": 31.0, "completions/max_terminated_length": 31.0, "completions/mean_length": 15.25, "completions/mean_terminated_length": 15.25, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 16.574074074074073, "frac_reward_zero_std": 1.0, "grad_norm": 0.21019329130649567, "kl": 0.02870894642546773, "learning_rate": 4.47e-06, "loss": 0.0017, "num_tokens": 280409.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 895 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.75, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.75, "completions/mean_terminated_length": 14.75, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 16.59259259259259, "frac_reward_zero_std": 1.0, "grad_norm": 0.07542246580123901, "kl": 0.008213723311200738, "learning_rate": 4.475e-06, "loss": 0.0004, "num_tokens": 280728.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 896 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.25, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 20.25, "completions/mean_terminated_length": 20.25, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 16.61111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.39923930168151855, "kl": 0.05841900780797005, "learning_rate": 4.48e-06, "loss": 0.0029, "num_tokens": 281033.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 897 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 16.62962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.027436692267656326, "kl": 0.0018590688705444336, "learning_rate": 4.485e-06, "loss": 0.0001, "num_tokens": 281241.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 898 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 16.64814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.026138639077544212, "kl": 0.0018165380170103163, "learning_rate": 4.49e-06, "loss": 0.0001, "num_tokens": 281501.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 899 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.0, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 24.0, "completions/mean_terminated_length": 24.0, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 16.666666666666668, "frac_reward_zero_std": 1.0, "grad_norm": 0.02029547095298767, "kl": 0.08715720847249031, "learning_rate": 4.495e-06, "loss": 0.0044, "num_tokens": 281865.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 900 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 16.685185185185187, "frac_reward_zero_std": 1.0, "grad_norm": 0.09936147928237915, "kl": 0.015672791749238968, "learning_rate": 4.5e-06, "loss": 0.0008, "num_tokens": 282137.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 901 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 16.703703703703702, "frac_reward_zero_std": 1.0, "grad_norm": 0.002962617203593254, "kl": 0.018214997835457325, "learning_rate": 4.505e-06, "loss": 0.0009, "num_tokens": 282397.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 902 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 16.72222222222222, "frac_reward_zero_std": 1.0, "grad_norm": 0.23695170879364014, "kl": 0.01777654141187668, "learning_rate": 4.510000000000001e-06, "loss": 0.001, "num_tokens": 282661.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 903 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 16.74074074074074, "frac_reward_zero_std": 0.0, "grad_norm": 3.346383571624756, "kl": 0.030936136841773987, "learning_rate": 4.515000000000001e-06, "loss": 0.0165, "num_tokens": 282953.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 904 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 33.25, "completions/clipped_ratio": 0.0, "completions/max_length": 36.0, "completions/max_terminated_length": 36.0, "completions/mean_length": 33.25, "completions/mean_terminated_length": 33.25, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 16.75925925925926, "frac_reward_zero_std": 0.0, "grad_norm": 4.573971748352051, "kl": 0.02019700314849615, "learning_rate": 4.520000000000001e-06, "loss": 0.0232, "num_tokens": 283302.0, "reward": 2.875, "reward_std": 1.25, "rewards/reward_combined/mean": 2.875, "rewards/reward_combined/std": 1.25, "step": 905 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.5, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 16.5, "completions/mean_terminated_length": 16.5, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 16.77777777777778, "frac_reward_zero_std": 0.0, "grad_norm": 1.1474679708480835, "kl": 0.24701471999287605, "learning_rate": 4.525000000000001e-06, "loss": 0.042, "num_tokens": 283616.0, "reward": 2.75, "reward_std": 1.5, "rewards/reward_combined/mean": 2.75, "rewards/reward_combined/std": 1.5, "step": 906 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 40.75, "completions/clipped_ratio": 0.0, "completions/max_length": 43.0, "completions/max_terminated_length": 43.0, "completions/mean_length": 40.75, "completions/mean_terminated_length": 40.75, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 16.796296296296298, "frac_reward_zero_std": 0.0, "grad_norm": 1.4319310188293457, "kl": 0.02976155374199152, "learning_rate": 4.530000000000001e-06, "loss": -0.0392, "num_tokens": 284059.0, "reward": 1.975000023841858, "reward_std": 1.053169846534729, "rewards/reward_combined/mean": 1.975000023841858, "rewards/reward_combined/std": 1.053169846534729, "step": 907 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 16.814814814814813, "frac_reward_zero_std": 1.0, "grad_norm": 0.06275501102209091, "kl": 0.021888832561671734, "learning_rate": 4.535000000000001e-06, "loss": 0.0011, "num_tokens": 284327.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 908 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.5, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 16.5, "completions/mean_terminated_length": 16.5, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 16.833333333333332, "frac_reward_zero_std": 0.0, "grad_norm": 5.788454055786133, "kl": 0.07020635157823563, "learning_rate": 4.540000000000001e-06, "loss": -0.2032, "num_tokens": 284641.0, "reward": 5.25, "reward_std": 4.5, "rewards/reward_combined/mean": 5.25, "rewards/reward_combined/std": 4.5, "step": 909 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.0, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 21.0, "completions/mean_terminated_length": 21.0, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 16.85185185185185, "frac_reward_zero_std": 0.0, "grad_norm": 3.8611414432525635, "kl": 0.06362941116094589, "learning_rate": 4.5450000000000005e-06, "loss": -0.0081, "num_tokens": 284973.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 910 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 34.0, "completions/clipped_ratio": 0.0, "completions/max_length": 59.0, "completions/max_terminated_length": 59.0, "completions/mean_length": 34.0, "completions/mean_terminated_length": 34.0, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 16.87037037037037, "frac_reward_zero_std": 0.0, "grad_norm": 2.8236982822418213, "kl": 0.10282353311777115, "learning_rate": 4.5500000000000005e-06, "loss": 0.1768, "num_tokens": 285329.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 911 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 16.88888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.17693288624286652, "kl": 0.0077920351177453995, "learning_rate": 4.5550000000000004e-06, "loss": 0.0004, "num_tokens": 285627.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 912 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 29.0, "completions/clipped_ratio": 0.0, "completions/max_length": 32.0, "completions/max_terminated_length": 32.0, "completions/mean_length": 29.0, "completions/mean_terminated_length": 29.0, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 16.90740740740741, "frac_reward_zero_std": 0.0, "grad_norm": 4.6667656898498535, "kl": 0.08277727290987968, "learning_rate": 4.56e-06, "loss": 0.0371, "num_tokens": 285971.0, "reward": 4.375, "reward_std": 2.428133726119995, "rewards/reward_combined/mean": 4.375, "rewards/reward_combined/std": 2.428133726119995, "step": 913 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 16.925925925925927, "frac_reward_zero_std": 1.0, "grad_norm": 1.5462397336959839, "kl": 0.14007742842659354, "learning_rate": 4.565e-06, "loss": 0.008, "num_tokens": 286237.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 914 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 16.944444444444443, "frac_reward_zero_std": 1.0, "grad_norm": 0.3838156759738922, "kl": 0.02852160087786615, "learning_rate": 4.57e-06, "loss": 0.0018, "num_tokens": 286509.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 915 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.5, "completions/clipped_ratio": 0.0, "completions/max_length": 44.0, "completions/max_terminated_length": 44.0, "completions/mean_length": 23.5, "completions/mean_terminated_length": 23.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 16.962962962962962, "frac_reward_zero_std": 0.0, "grad_norm": 5.9228129386901855, "kl": 0.04898169403895736, "learning_rate": 4.575e-06, "loss": 0.0527, "num_tokens": 286819.0, "reward": 4.0, "reward_std": 4.690415859222412, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 4.690415859222412, "step": 916 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.011111111380159855, "clip_ratio/low_min": 0.011111111380159855, "clip_ratio/region_mean": 0.011111111380159855, "completion_length": 26.5, "completions/clipped_ratio": 0.0, "completions/max_length": 37.0, "completions/max_terminated_length": 37.0, "completions/mean_length": 26.5, "completions/mean_terminated_length": 26.5, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 16.98148148148148, "frac_reward_zero_std": 0.0, "grad_norm": 1.8547072410583496, "kl": 0.012318878434598446, "learning_rate": 4.58e-06, "loss": 0.0936, "num_tokens": 287153.0, "reward": 4.5, "reward_std": 2.345207929611206, "rewards/reward_combined/mean": 4.5, "rewards/reward_combined/std": 2.345207929611206, "step": 917 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.5, "completions/clipped_ratio": 0.0, "completions/max_length": 35.0, "completions/max_terminated_length": 35.0, "completions/mean_length": 23.5, "completions/mean_terminated_length": 23.5, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 17.0, "frac_reward_zero_std": 0.0, "grad_norm": 4.586088180541992, "kl": 0.09591428190469742, "learning_rate": 4.585e-06, "loss": 0.0123, "num_tokens": 287475.0, "reward": 5.125, "reward_std": 3.4731109142303467, "rewards/reward_combined/mean": 5.125, "rewards/reward_combined/std": 3.4731109142303467, "step": 918 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 17.01851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.004580376204103231, "kl": 0.00015791505575180054, "learning_rate": 4.590000000000001e-06, "loss": 0.0, "num_tokens": 287687.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 919 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.25, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 12.25, "completions/mean_terminated_length": 12.25, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 17.037037037037038, "frac_reward_zero_std": 0.0, "grad_norm": 4.9544267654418945, "kl": 0.017200525850057602, "learning_rate": 4.595000000000001e-06, "loss": 0.1695, "num_tokens": 287960.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 920 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 5.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 5.0, "completions/mean_terminated_length": 5.0, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 17.055555555555557, "frac_reward_zero_std": 1.0, "grad_norm": 0.007309808395802975, "kl": 0.0020573020447045565, "learning_rate": 4.600000000000001e-06, "loss": 0.0001, "num_tokens": 288180.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 921 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 17.074074074074073, "frac_reward_zero_std": 1.0, "grad_norm": 0.010817904956638813, "kl": 0.0027435272932052612, "learning_rate": 4.605000000000001e-06, "loss": 0.0001, "num_tokens": 288392.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 922 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.25, "completions/clipped_ratio": 0.0, "completions/max_length": 29.0, "completions/max_terminated_length": 29.0, "completions/mean_length": 23.25, "completions/mean_terminated_length": 23.25, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 17.09259259259259, "frac_reward_zero_std": 0.0, "grad_norm": 7.2395758628845215, "kl": 0.027951393276453018, "learning_rate": 4.610000000000001e-06, "loss": 0.0456, "num_tokens": 288733.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 923 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 30.75, "completions/clipped_ratio": 0.0, "completions/max_length": 45.0, "completions/max_terminated_length": 45.0, "completions/mean_length": 30.75, "completions/mean_terminated_length": 30.75, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 17.11111111111111, "frac_reward_zero_std": 0.0, "grad_norm": 2.9375905990600586, "kl": 0.08811518922448158, "learning_rate": 4.615000000000001e-06, "loss": 0.1191, "num_tokens": 289080.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 924 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 29.75, "completions/clipped_ratio": 0.0, "completions/max_length": 45.0, "completions/max_terminated_length": 45.0, "completions/mean_length": 29.75, "completions/mean_terminated_length": 29.75, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 17.12962962962963, "frac_reward_zero_std": 0.0, "grad_norm": 3.5054006576538086, "kl": 0.0995265431702137, "learning_rate": 4.620000000000001e-06, "loss": 0.154, "num_tokens": 289427.0, "reward": 6.0, "reward_std": 3.34165620803833, "rewards/reward_combined/mean": 6.0, "rewards/reward_combined/std": 3.34165620803833, "step": 925 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 17.14814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.07191174477338791, "kl": 0.005768002010881901, "learning_rate": 4.625000000000001e-06, "loss": 0.0003, "num_tokens": 289703.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 926 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.25, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 21.25, "completions/mean_terminated_length": 21.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 17.166666666666668, "frac_reward_zero_std": 0.0, "grad_norm": 8.865012168884277, "kl": 0.04980063438415527, "learning_rate": 4.6300000000000006e-06, "loss": -0.1026, "num_tokens": 290040.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 927 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 17.185185185185187, "frac_reward_zero_std": 1.0, "grad_norm": 0.003205676097422838, "kl": 0.018184813670814037, "learning_rate": 4.6350000000000005e-06, "loss": 0.0009, "num_tokens": 290300.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 928 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.5, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 21.5, "completions/mean_terminated_length": 21.5, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 17.203703703703702, "frac_reward_zero_std": 1.0, "grad_norm": 0.0389740988612175, "kl": 0.004672963172197342, "learning_rate": 4.6400000000000005e-06, "loss": 0.0002, "num_tokens": 290630.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 929 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.25, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 18.25, "completions/mean_terminated_length": 18.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 17.22222222222222, "frac_reward_zero_std": 0.0, "grad_norm": 2.3501877784729004, "kl": 0.18326909840106964, "learning_rate": 4.645e-06, "loss": 0.0118, "num_tokens": 290943.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 930 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 17.24074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.09225224703550339, "kl": 0.028808110393583775, "learning_rate": 4.65e-06, "loss": 0.0014, "num_tokens": 291255.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 931 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 27.5, "completions/clipped_ratio": 0.0, "completions/max_length": 51.0, "completions/max_terminated_length": 51.0, "completions/mean_length": 27.5, "completions/mean_terminated_length": 27.5, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 17.25925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.8429964780807495, "kl": 0.22731666266918182, "learning_rate": 4.655e-06, "loss": 0.0118, "num_tokens": 291585.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 932 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 17.27777777777778, "frac_reward_zero_std": 1.0, "grad_norm": 0.10553164035081863, "kl": 0.0115192960947752, "learning_rate": 4.66e-06, "loss": 0.0006, "num_tokens": 291867.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 933 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 17.296296296296298, "frac_reward_zero_std": 1.0, "grad_norm": 0.04552564024925232, "kl": 0.0008853524923324585, "learning_rate": 4.665e-06, "loss": 0.0, "num_tokens": 292123.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 934 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.5, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 20.5, "completions/mean_terminated_length": 20.5, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 17.314814814814813, "frac_reward_zero_std": 1.0, "grad_norm": 0.11315102130174637, "kl": 0.02815634198486805, "learning_rate": 4.670000000000001e-06, "loss": 0.0014, "num_tokens": 292453.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 935 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.25, "completions/clipped_ratio": 0.0, "completions/max_length": 41.0, "completions/max_terminated_length": 41.0, "completions/mean_length": 24.25, "completions/mean_terminated_length": 24.25, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 17.333333333333332, "frac_reward_zero_std": 1.0, "grad_norm": 0.09609337151050568, "kl": 0.022632768377661705, "learning_rate": 4.675000000000001e-06, "loss": 0.0011, "num_tokens": 292770.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 936 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 17.35185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.07105891406536102, "kl": 0.018622465431690216, "learning_rate": 4.680000000000001e-06, "loss": 0.0009, "num_tokens": 293058.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 937 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 72.0, "completions/clipped_ratio": 0.25, "completions/max_length": 256.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 72.0, "completions/mean_terminated_length": 10.666666984558105, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 17.37037037037037, "frac_reward_zero_std": 0.0, "grad_norm": 3.0524377822875977, "kl": 0.01602680142968893, "learning_rate": 4.685000000000001e-06, "loss": 0.4479, "num_tokens": 293566.0, "reward": 7.125, "reward_std": 0.75, "rewards/reward_combined/mean": 7.125, "rewards/reward_combined/std": 0.75, "step": 938 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.75, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.75, "completions/mean_terminated_length": 14.75, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 17.38888888888889, "frac_reward_zero_std": 0.0, "grad_norm": 4.094427108764648, "kl": 0.016079598106443882, "learning_rate": 4.69e-06, "loss": 0.0033, "num_tokens": 293857.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 939 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 87.75, "completions/clipped_ratio": 0.25, "completions/max_length": 256.0, "completions/max_terminated_length": 39.0, "completions/mean_length": 87.75, "completions/mean_terminated_length": 31.666667938232422, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 17.40740740740741, "frac_reward_zero_std": 0.0, "grad_norm": 2.6764514446258545, "kl": 0.12799622118473053, "learning_rate": 4.695e-06, "loss": 0.239, "num_tokens": 294444.0, "reward": -0.25, "reward_std": 2.598076105117798, "rewards/reward_combined/mean": -0.25, "rewards/reward_combined/std": 2.598076105117798, "step": 940 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.75, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 14.75, "completions/mean_terminated_length": 14.75, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 17.425925925925927, "frac_reward_zero_std": 1.0, "grad_norm": 0.09701244533061981, "kl": 0.008743939688429236, "learning_rate": 4.7e-06, "loss": 0.0004, "num_tokens": 294715.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 941 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.5, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 18.5, "completions/mean_terminated_length": 18.5, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 17.444444444444443, "frac_reward_zero_std": 1.0, "grad_norm": 0.05021260306239128, "kl": 0.011681301053613424, "learning_rate": 4.705e-06, "loss": 0.0006, "num_tokens": 295017.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 942 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 31.0, "completions/clipped_ratio": 0.0, "completions/max_length": 33.0, "completions/max_terminated_length": 33.0, "completions/mean_length": 31.0, "completions/mean_terminated_length": 31.0, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 17.462962962962962, "frac_reward_zero_std": 0.0, "grad_norm": 3.8748624324798584, "kl": 0.020303060300648212, "learning_rate": 4.71e-06, "loss": -0.0057, "num_tokens": 295357.0, "reward": 2.125, "reward_std": 1.6007810831069946, "rewards/reward_combined/mean": 2.125, "rewards/reward_combined/std": 1.6007810831069946, "step": 943 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.5, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 20.5, "completions/mean_terminated_length": 20.5, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 17.48148148148148, "frac_reward_zero_std": 0.0, "grad_norm": 5.01041316986084, "kl": 0.0825581643730402, "learning_rate": 4.715e-06, "loss": -0.0066, "num_tokens": 295663.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 944 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 43.25, "completions/clipped_ratio": 0.0, "completions/max_length": 44.0, "completions/max_terminated_length": 44.0, "completions/mean_length": 43.25, "completions/mean_terminated_length": 43.25, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 17.5, "frac_reward_zero_std": 0.0, "grad_norm": 1.6712485551834106, "kl": 0.04359029233455658, "learning_rate": 4.7200000000000005e-06, "loss": -0.0026, "num_tokens": 296116.0, "reward": 2.5999999046325684, "reward_std": 0.4618801772594452, "rewards/reward_combined/mean": 2.5999999046325684, "rewards/reward_combined/std": 0.4618801772594452, "step": 945 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.25, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 16.25, "completions/mean_terminated_length": 16.25, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 17.51851851851852, "frac_reward_zero_std": 0.0, "grad_norm": 8.650443077087402, "kl": 0.06182192172855139, "learning_rate": 4.7250000000000005e-06, "loss": 0.1705, "num_tokens": 296401.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 946 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.25, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 12.25, "completions/mean_terminated_length": 12.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 17.537037037037038, "frac_reward_zero_std": 0.0, "grad_norm": 5.052463531494141, "kl": 0.06261061690747738, "learning_rate": 4.7300000000000005e-06, "loss": 0.0946, "num_tokens": 296674.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 947 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.75, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 8.75, "completions/mean_terminated_length": 8.75, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 17.555555555555557, "frac_reward_zero_std": 0.0, "grad_norm": 8.714799880981445, "kl": 0.02572388923726976, "learning_rate": 4.735e-06, "loss": 0.2086, "num_tokens": 296917.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 948 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 10.5, "completions/mean_terminated_length": 10.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 17.574074074074073, "frac_reward_zero_std": 1.0, "grad_norm": 0.19431650638580322, "kl": 0.016289540566504, "learning_rate": 4.74e-06, "loss": 0.0008, "num_tokens": 297183.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 949 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.75, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 2.75, "completions/mean_terminated_length": 2.75, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 17.59259259259259, "frac_reward_zero_std": 1.0, "grad_norm": 0.7515743970870972, "kl": 0.06096542626619339, "learning_rate": 4.745e-06, "loss": 0.0042, "num_tokens": 297390.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 950 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.5, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 8.5, "completions/mean_terminated_length": 8.5, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 17.61111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.08670561760663986, "kl": 0.007786396075971425, "learning_rate": 4.75e-06, "loss": 0.0002, "num_tokens": 297644.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 951 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.75, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 16.75, "completions/mean_terminated_length": 16.75, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 17.62962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.023190539330244064, "kl": 0.000953354494413361, "learning_rate": 4.755e-06, "loss": 0.0, "num_tokens": 297951.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_combined/mean": 1.0, "rewards/reward_combined/std": 0.0, "step": 952 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 17.64814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.03694096580147743, "kl": 0.004146612307522446, "learning_rate": 4.76e-06, "loss": 0.0002, "num_tokens": 298233.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 953 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 17.666666666666668, "frac_reward_zero_std": 1.0, "grad_norm": 0.08695261180400848, "kl": 0.004189092665910721, "learning_rate": 4.765e-06, "loss": 0.0002, "num_tokens": 298477.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 954 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.5, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 22.5, "completions/mean_terminated_length": 22.5, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 17.685185185185187, "frac_reward_zero_std": 1.0, "grad_norm": 0.04295063018798828, "kl": 0.010230758227407932, "learning_rate": 4.77e-06, "loss": 0.0005, "num_tokens": 298795.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 955 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.25, "completions/clipped_ratio": 0.0, "completions/max_length": 28.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 21.25, "completions/mean_terminated_length": 21.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 17.703703703703702, "frac_reward_zero_std": 0.0, "grad_norm": 3.1890711784362793, "kl": 0.12818191945552826, "learning_rate": 4.775e-06, "loss": 0.0763, "num_tokens": 299132.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 956 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.75, "completions/clipped_ratio": 0.0, "completions/max_length": 27.0, "completions/max_terminated_length": 27.0, "completions/mean_length": 19.75, "completions/mean_terminated_length": 19.75, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 17.72222222222222, "frac_reward_zero_std": 0.0, "grad_norm": 3.651560068130493, "kl": 0.07588614523410797, "learning_rate": 4.78e-06, "loss": 0.1625, "num_tokens": 299471.0, "reward": 2.25, "reward_std": 3.5, "rewards/reward_combined/mean": 2.25, "rewards/reward_combined/std": 3.5, "step": 957 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.0, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 19.0, "completions/mean_terminated_length": 19.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 17.74074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.044175535440444946, "kl": 0.003025888028787449, "learning_rate": 4.785e-06, "loss": 0.0002, "num_tokens": 299787.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 958 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 17.75925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.07941427081823349, "kl": 0.006509590020868927, "learning_rate": 4.79e-06, "loss": 0.0003, "num_tokens": 300083.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 959 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.25, "completions/clipped_ratio": 0.0, "completions/max_length": 25.0, "completions/max_terminated_length": 25.0, "completions/mean_length": 24.25, "completions/mean_terminated_length": 24.25, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 17.77777777777778, "frac_reward_zero_std": 0.0, "grad_norm": 1.7821283340454102, "kl": 0.19773155450820923, "learning_rate": 4.795e-06, "loss": 0.0107, "num_tokens": 300448.0, "reward": 2.875, "reward_std": 1.25, "rewards/reward_combined/mean": 2.875, "rewards/reward_combined/std": 1.25, "step": 960 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 17.796296296296298, "frac_reward_zero_std": 1.0, "grad_norm": 0.05966145172715187, "kl": 0.002263894653879106, "learning_rate": 4.800000000000001e-06, "loss": 0.0001, "num_tokens": 300704.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 961 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 17.814814814814813, "frac_reward_zero_std": 1.0, "grad_norm": 0.04099452123045921, "kl": 0.0058186890091747046, "learning_rate": 4.805000000000001e-06, "loss": 0.0003, "num_tokens": 301008.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 962 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 17.833333333333332, "frac_reward_zero_std": 1.0, "grad_norm": 0.0001834886788856238, "kl": 1.0870397090911865e-05, "learning_rate": 4.8100000000000005e-06, "loss": 0.0, "num_tokens": 301228.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 963 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.25, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 8.25, "completions/mean_terminated_length": 8.25, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 17.85185185185185, "frac_reward_zero_std": 0.0, "grad_norm": 5.069172382354736, "kl": 0.16043394058942795, "learning_rate": 4.8150000000000005e-06, "loss": 0.3487, "num_tokens": 301461.0, "reward": 3.875, "reward_std": 0.25, "rewards/reward_combined/mean": 3.875, "rewards/reward_combined/std": 0.25, "step": 964 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.0, "completions/clipped_ratio": 0.0, "completions/max_length": 25.0, "completions/max_terminated_length": 25.0, "completions/mean_length": 21.0, "completions/mean_terminated_length": 21.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 17.87037037037037, "frac_reward_zero_std": 0.0, "grad_norm": 6.122995376586914, "kl": 0.14653189852833748, "learning_rate": 4.8200000000000004e-06, "loss": 0.0328, "num_tokens": 301773.0, "reward": 5.125, "reward_std": 3.4731109142303467, "rewards/reward_combined/mean": 5.125, "rewards/reward_combined/std": 3.4731109142303467, "step": 965 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.75, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 4.75, "completions/mean_terminated_length": 4.75, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 17.88888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 2.2828242778778076, "kl": 0.1684875637292862, "learning_rate": 4.825e-06, "loss": 0.0106, "num_tokens": 302012.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 966 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.25, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.25, "completions/mean_terminated_length": 13.25, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 17.90740740740741, "frac_reward_zero_std": 1.0, "grad_norm": 0.21816301345825195, "kl": 0.020018688403069973, "learning_rate": 4.83e-06, "loss": 0.001, "num_tokens": 302297.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 967 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.25, "completions/clipped_ratio": 0.0, "completions/max_length": 40.0, "completions/max_terminated_length": 40.0, "completions/mean_length": 21.25, "completions/mean_terminated_length": 21.25, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 17.925925925925927, "frac_reward_zero_std": 0.0, "grad_norm": 5.653951644897461, "kl": 0.1280926689505577, "learning_rate": 4.835e-06, "loss": 0.2288, "num_tokens": 302598.0, "reward": 4.5, "reward_std": 4.041451930999756, "rewards/reward_combined/mean": 4.5, "rewards/reward_combined/std": 4.041451930999756, "step": 968 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.0, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 22.0, "completions/mean_terminated_length": 22.0, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 17.944444444444443, "frac_reward_zero_std": 0.0, "grad_norm": 6.101024627685547, "kl": 0.06004241667687893, "learning_rate": 4.84e-06, "loss": 0.0162, "num_tokens": 302906.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 969 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.75, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 10.75, "completions/mean_terminated_length": 10.75, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 17.962962962962962, "frac_reward_zero_std": 1.0, "grad_norm": 0.19164133071899414, "kl": 0.010273074731230736, "learning_rate": 4.845e-06, "loss": 0.0005, "num_tokens": 303177.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 970 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 17.98148148148148, "frac_reward_zero_std": 0.0, "grad_norm": 2.9586422443389893, "kl": 0.227063849568367, "learning_rate": 4.85e-06, "loss": 0.0276, "num_tokens": 303471.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 971 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 18.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.03807785362005234, "kl": 0.0031582117080688477, "learning_rate": 4.855e-06, "loss": 0.0002, "num_tokens": 303731.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 972 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.5, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 9.5, "completions/mean_terminated_length": 9.5, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 18.01851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 2.1092758178710938, "kl": 0.25289690867066383, "learning_rate": 4.86e-06, "loss": 0.0125, "num_tokens": 303989.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 973 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 42.5, "completions/clipped_ratio": 0.0, "completions/max_length": 43.0, "completions/max_terminated_length": 43.0, "completions/mean_length": 42.5, "completions/mean_terminated_length": 42.5, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "epoch": 18.037037037037038, "frac_reward_zero_std": 0.0, "grad_norm": 1.118734359741211, "kl": 0.020393870770931244, "learning_rate": 4.865e-06, "loss": -0.0103, "num_tokens": 304439.0, "reward": 2.4000000953674316, "reward_std": 0.3999999463558197, "rewards/reward_combined/mean": 2.4000000953674316, "rewards/reward_combined/std": 0.4000000059604645, "step": 974 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.75, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 13.75, "completions/mean_terminated_length": 13.75, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 18.055555555555557, "frac_reward_zero_std": 0.0, "grad_norm": 6.3465189933776855, "kl": 0.12388385832309723, "learning_rate": 4.87e-06, "loss": 0.0414, "num_tokens": 304710.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 975 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 18.074074074074073, "frac_reward_zero_std": 1.0, "grad_norm": 0.05079726129770279, "kl": 0.0016543567762710154, "learning_rate": 4.875e-06, "loss": 0.0001, "num_tokens": 304966.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 976 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 18.09259259259259, "frac_reward_zero_std": 1.0, "grad_norm": 0.009937197901308537, "kl": 0.002740219235420227, "learning_rate": 4.880000000000001e-06, "loss": 0.0001, "num_tokens": 305178.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 977 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.0, "completions/clipped_ratio": 0.0, "completions/max_length": 30.0, "completions/max_terminated_length": 30.0, "completions/mean_length": 25.0, "completions/mean_terminated_length": 25.0, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 18.11111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.15877027809619904, "kl": 0.03011676762253046, "learning_rate": 4.885000000000001e-06, "loss": 0.0015, "num_tokens": 305506.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 978 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 18.12962962962963, "frac_reward_zero_std": 0.0, "grad_norm": 1.6659550666809082, "kl": 0.019829501397907734, "learning_rate": 4.890000000000001e-06, "loss": 0.001, "num_tokens": 305766.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 979 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.25, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 4.25, "completions/mean_terminated_length": 4.25, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 18.14814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.15793824195861816, "kl": 0.019602757645770907, "learning_rate": 4.8950000000000006e-06, "loss": 0.0011, "num_tokens": 305979.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 980 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 18.166666666666668, "frac_reward_zero_std": 1.0, "grad_norm": 0.00022987854026723653, "kl": 1.879781484603882e-05, "learning_rate": 4.9000000000000005e-06, "loss": 0.0, "num_tokens": 306199.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 981 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.5, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 12.5, "completions/mean_terminated_length": 12.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 18.185185185185187, "frac_reward_zero_std": 1.0, "grad_norm": 0.02219484932720661, "kl": 0.0019876256119459867, "learning_rate": 4.9050000000000005e-06, "loss": 0.0001, "num_tokens": 306473.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 982 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 18.203703703703702, "frac_reward_zero_std": 1.0, "grad_norm": 0.09062913805246353, "kl": 0.024223879911005497, "learning_rate": 4.9100000000000004e-06, "loss": 0.0012, "num_tokens": 306785.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 983 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.75, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 16.75, "completions/mean_terminated_length": 16.75, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 18.22222222222222, "frac_reward_zero_std": 1.0, "grad_norm": 0.22346359491348267, "kl": 0.03548317006789148, "learning_rate": 4.915e-06, "loss": 0.0021, "num_tokens": 307112.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 984 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 31.25, "completions/clipped_ratio": 0.0, "completions/max_length": 81.0, "completions/max_terminated_length": 81.0, "completions/mean_length": 31.25, "completions/mean_terminated_length": 31.25, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 18.24074074074074, "frac_reward_zero_std": 0.0, "grad_norm": 2.869357109069824, "kl": 0.004943526349961758, "learning_rate": 4.92e-06, "loss": 0.1971, "num_tokens": 307449.0, "reward": 5.125, "reward_std": 3.4731109142303467, "rewards/reward_combined/mean": 5.125, "rewards/reward_combined/std": 3.4731109142303467, "step": 985 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 18.25925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.0909104123711586, "kl": 0.011043385835364461, "learning_rate": 4.925e-06, "loss": 0.0006, "num_tokens": 307753.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 986 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 39.0, "completions/clipped_ratio": 0.0, "completions/max_length": 105.0, "completions/max_terminated_length": 105.0, "completions/mean_length": 39.0, "completions/mean_terminated_length": 39.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 18.27777777777778, "frac_reward_zero_std": 0.0, "grad_norm": 2.6298279762268066, "kl": 0.0052692034223582596, "learning_rate": 4.93e-06, "loss": 0.3617, "num_tokens": 308149.0, "reward": 0.875, "reward_std": 0.25, "rewards/reward_combined/mean": 0.875, "rewards/reward_combined/std": 0.25, "step": 987 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.0, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 21.0, "completions/mean_terminated_length": 21.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 18.296296296296298, "frac_reward_zero_std": 0.0, "grad_norm": 5.775054931640625, "kl": 0.15209099650382996, "learning_rate": 4.935e-06, "loss": 0.0781, "num_tokens": 308453.0, "reward": 5.25, "reward_std": 4.5, "rewards/reward_combined/mean": 5.25, "rewards/reward_combined/std": 4.5, "step": 988 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 28.5, "completions/clipped_ratio": 0.0, "completions/max_length": 35.0, "completions/max_terminated_length": 35.0, "completions/mean_length": 28.5, "completions/mean_terminated_length": 28.5, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 18.314814814814813, "frac_reward_zero_std": 0.0, "grad_norm": 4.857112407684326, "kl": 0.1001400463283062, "learning_rate": 4.94e-06, "loss": 0.0546, "num_tokens": 308803.0, "reward": 2.75, "reward_std": 1.1902379989624023, "rewards/reward_combined/mean": 2.75, "rewards/reward_combined/std": 1.190238118171692, "step": 989 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 18.333333333333332, "frac_reward_zero_std": 1.0, "grad_norm": 0.017026079818606377, "kl": 0.019935433752834797, "learning_rate": 4.945e-06, "loss": 0.001, "num_tokens": 309075.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 990 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.25, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 15.25, "completions/mean_terminated_length": 15.25, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 18.35185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.34564006328582764, "kl": 0.08192902058362961, "learning_rate": 4.95e-06, "loss": 0.0041, "num_tokens": 309368.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 991 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.75, "completions/clipped_ratio": 0.0, "completions/max_length": 25.0, "completions/max_terminated_length": 25.0, "completions/mean_length": 22.75, "completions/mean_terminated_length": 22.75, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 18.37037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.11810842901468277, "kl": 0.016174776945263147, "learning_rate": 4.955e-06, "loss": 0.0008, "num_tokens": 309703.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 992 }, { "clip_ratio/high_max": 0.011111111380159855, "clip_ratio/high_mean": 0.011111111380159855, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.011111111380159855, "completion_length": 21.25, "completions/clipped_ratio": 0.0, "completions/max_length": 25.0, "completions/max_terminated_length": 25.0, "completions/mean_length": 21.25, "completions/mean_terminated_length": 21.25, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 18.38888888888889, "frac_reward_zero_std": 0.0, "grad_norm": 4.7717390060424805, "kl": 0.08690868876874447, "learning_rate": 4.960000000000001e-06, "loss": -0.0247, "num_tokens": 310012.0, "reward": 5.875, "reward_std": 3.25, "rewards/reward_combined/mean": 5.875, "rewards/reward_combined/std": 3.25, "step": 993 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 7.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 7.0, "completions/mean_terminated_length": 7.0, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 18.40740740740741, "frac_reward_zero_std": 1.0, "grad_norm": 0.03679986298084259, "kl": 0.0031516338931396604, "learning_rate": 4.965000000000001e-06, "loss": 0.0002, "num_tokens": 310248.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 994 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.0, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 19.0, "completions/mean_terminated_length": 19.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 18.425925925925927, "frac_reward_zero_std": 1.0, "grad_norm": 0.05102856084704399, "kl": 0.02146243304014206, "learning_rate": 4.970000000000001e-06, "loss": 0.0011, "num_tokens": 310572.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 995 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 18.444444444444443, "frac_reward_zero_std": 1.0, "grad_norm": 0.028189506381750107, "kl": 0.0005374252796173096, "learning_rate": 4.975000000000001e-06, "loss": 0.0, "num_tokens": 310784.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 996 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.25, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 19.25, "completions/mean_terminated_length": 19.25, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 18.462962962962962, "frac_reward_zero_std": 1.0, "grad_norm": 0.026373879984021187, "kl": 0.008581475354731083, "learning_rate": 4.980000000000001e-06, "loss": 0.0004, "num_tokens": 311089.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 997 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 56.0, "completions/clipped_ratio": 0.25, "completions/max_length": 158.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 56.0, "completions/mean_terminated_length": 22.0, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 18.48148148148148, "frac_reward_zero_std": 0.0, "grad_norm": 2.630964517593384, "kl": 0.04553654044866562, "learning_rate": 4.9850000000000006e-06, "loss": 0.022, "num_tokens": 311565.0, "reward": 4.375, "reward_std": 4.190763473510742, "rewards/reward_combined/mean": 4.375, "rewards/reward_combined/std": 4.190763473510742, "step": 998 }, { "clip_ratio/high_max": 0.01315789483487606, "clip_ratio/high_mean": 0.01315789483487606, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.01315789483487606, "completion_length": 22.25, "completions/clipped_ratio": 0.0, "completions/max_length": 27.0, "completions/max_terminated_length": 27.0, "completions/mean_length": 22.25, "completions/mean_terminated_length": 22.25, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 18.5, "frac_reward_zero_std": 0.0, "grad_norm": 5.175231456756592, "kl": 0.19063086062669754, "learning_rate": 4.9900000000000005e-06, "loss": -0.0594, "num_tokens": 311882.0, "reward": 5.25, "reward_std": 2.598076105117798, "rewards/reward_combined/mean": 5.25, "rewards/reward_combined/std": 2.598076105117798, "step": 999 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.75, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 19.75, "completions/mean_terminated_length": 19.75, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 18.51851851851852, "frac_reward_zero_std": 0.0, "grad_norm": 8.169957160949707, "kl": 0.11680833250284195, "learning_rate": 4.9950000000000005e-06, "loss": 0.0548, "num_tokens": 312181.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 1000 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.5, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 9.5, "completions/mean_terminated_length": 9.5, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 18.537037037037038, "frac_reward_zero_std": 1.0, "grad_norm": 0.035914286971092224, "kl": 0.0015497945714741945, "learning_rate": 5e-06, "loss": 0.0001, "num_tokens": 312443.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1001 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 18.555555555555557, "frac_reward_zero_std": 1.0, "grad_norm": 0.06538234651088715, "kl": 0.023168988525867462, "learning_rate": 4.999444444444445e-06, "loss": 0.0012, "num_tokens": 312731.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1002 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 18.574074074074073, "frac_reward_zero_std": 1.0, "grad_norm": 1.289617896080017, "kl": 0.1250031739473343, "learning_rate": 4.998888888888889e-06, "loss": 0.0063, "num_tokens": 312947.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 1003 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.5, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 12.5, "completions/mean_terminated_length": 12.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 18.59259259259259, "frac_reward_zero_std": 1.0, "grad_norm": 0.36779671907424927, "kl": 0.02395724132657051, "learning_rate": 4.998333333333334e-06, "loss": 0.0011, "num_tokens": 313221.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1004 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 18.61111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.14291010797023773, "kl": 0.04195016250014305, "learning_rate": 4.997777777777778e-06, "loss": 0.0021, "num_tokens": 313489.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1005 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 18.62962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.0652097761631012, "kl": 0.003066960023716092, "learning_rate": 4.997222222222223e-06, "loss": 0.0002, "num_tokens": 313749.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1006 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 18.64814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.005609285086393356, "kl": 0.0003465960326138884, "learning_rate": 4.9966666666666665e-06, "loss": 0.0, "num_tokens": 314021.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1007 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.75, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.75, "completions/mean_terminated_length": 14.75, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 18.666666666666668, "frac_reward_zero_std": 1.0, "grad_norm": 0.20291046798229218, "kl": 0.03553644847124815, "learning_rate": 4.996111111111112e-06, "loss": 0.0017, "num_tokens": 314312.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1008 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.0, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 24.0, "completions/mean_terminated_length": 24.0, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 18.685185185185187, "frac_reward_zero_std": 1.0, "grad_norm": 0.002160436473786831, "kl": 0.09073524922132492, "learning_rate": 4.995555555555556e-06, "loss": 0.0045, "num_tokens": 314676.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 1009 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.75, "completions/clipped_ratio": 0.0, "completions/max_length": 46.0, "completions/max_terminated_length": 46.0, "completions/mean_length": 25.75, "completions/mean_terminated_length": 25.75, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 18.703703703703702, "frac_reward_zero_std": 0.0, "grad_norm": 4.972053050994873, "kl": 0.17649994790554047, "learning_rate": 4.9950000000000005e-06, "loss": 0.2139, "num_tokens": 315031.0, "reward": 5.875, "reward_std": 3.25, "rewards/reward_combined/mean": 5.875, "rewards/reward_combined/std": 3.25, "step": 1010 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 18.72222222222222, "frac_reward_zero_std": 1.0, "grad_norm": 0.007617291994392872, "kl": 0.003298647701740265, "learning_rate": 4.994444444444445e-06, "loss": 0.0002, "num_tokens": 315291.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 1011 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 18.74074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.07212667167186737, "kl": 0.011507907882332802, "learning_rate": 4.993888888888889e-06, "loss": 0.0006, "num_tokens": 315569.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1012 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.75, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 16.75, "completions/mean_terminated_length": 16.75, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 18.75925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.11456499993801117, "kl": 0.13928480446338654, "learning_rate": 4.9933333333333335e-06, "loss": 0.007, "num_tokens": 315876.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1013 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.5, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 9.5, "completions/mean_terminated_length": 9.5, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 18.77777777777778, "frac_reward_zero_std": 1.0, "grad_norm": 0.3114992380142212, "kl": 0.0342629412189126, "learning_rate": 4.992777777777778e-06, "loss": 0.0025, "num_tokens": 316134.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 1014 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.75, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 17.75, "completions/mean_terminated_length": 17.75, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 18.796296296296298, "frac_reward_zero_std": 1.0, "grad_norm": 0.009369272738695145, "kl": 0.0009809261100599542, "learning_rate": 4.992222222222223e-06, "loss": 0.0, "num_tokens": 316445.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1015 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.25, "completions/clipped_ratio": 0.0, "completions/max_length": 32.0, "completions/max_terminated_length": 32.0, "completions/mean_length": 25.25, "completions/mean_terminated_length": 25.25, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 18.814814814814813, "frac_reward_zero_std": 1.0, "grad_norm": 0.946327805519104, "kl": 0.2524391934275627, "learning_rate": 4.991666666666667e-06, "loss": 0.0118, "num_tokens": 316770.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1016 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 18.833333333333332, "frac_reward_zero_std": 1.0, "grad_norm": 0.01573961041867733, "kl": 0.0010706241009756923, "learning_rate": 4.991111111111112e-06, "loss": 0.0001, "num_tokens": 317066.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1017 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 38.25, "completions/clipped_ratio": 0.0, "completions/max_length": 48.0, "completions/max_terminated_length": 48.0, "completions/mean_length": 38.25, "completions/mean_terminated_length": 38.25, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 18.85185185185185, "frac_reward_zero_std": 0.0, "grad_norm": 3.2555785179138184, "kl": 0.04656665958464146, "learning_rate": 4.990555555555555e-06, "loss": -0.0196, "num_tokens": 317435.0, "reward": 1.625, "reward_std": 1.25, "rewards/reward_combined/mean": 1.625, "rewards/reward_combined/std": 1.25, "step": 1018 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.25, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 20.25, "completions/mean_terminated_length": 20.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 18.87037037037037, "frac_reward_zero_std": 0.0, "grad_norm": 10.31079387664795, "kl": 0.05891003645956516, "learning_rate": 4.9900000000000005e-06, "loss": 0.1038, "num_tokens": 317736.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 1019 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 5.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 5.0, "completions/mean_terminated_length": 5.0, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 18.88888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.00773577019572258, "kl": 0.0019854247802868485, "learning_rate": 4.989444444444445e-06, "loss": 0.0001, "num_tokens": 317956.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 1020 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 18.90740740740741, "frac_reward_zero_std": 1.0, "grad_norm": 0.1388811320066452, "kl": 0.012818495277315378, "learning_rate": 4.988888888888889e-06, "loss": 0.0006, "num_tokens": 318228.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1021 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 18.925925925925927, "frac_reward_zero_std": 1.0, "grad_norm": 0.023495756089687347, "kl": 0.006376777775585651, "learning_rate": 4.9883333333333336e-06, "loss": 0.0003, "num_tokens": 318512.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1022 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 7.5, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 7.5, "completions/mean_terminated_length": 7.5, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 18.944444444444443, "frac_reward_zero_std": 1.0, "grad_norm": 0.16481538116931915, "kl": 0.013347809202969074, "learning_rate": 4.987777777777778e-06, "loss": 0.0007, "num_tokens": 318754.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 1023 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.0, "completions/clipped_ratio": 0.0, "completions/max_length": 39.0, "completions/max_terminated_length": 39.0, "completions/mean_length": 25.0, "completions/mean_terminated_length": 25.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 18.962962962962962, "frac_reward_zero_std": 0.0, "grad_norm": 5.297307014465332, "kl": 0.07211543247103691, "learning_rate": 4.987222222222223e-06, "loss": 0.1208, "num_tokens": 319082.0, "reward": 5.0, "reward_std": 3.674234628677368, "rewards/reward_combined/mean": 5.0, "rewards/reward_combined/std": 3.674234628677368, "step": 1024 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.25, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 4.25, "completions/mean_terminated_length": 4.25, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 18.98148148148148, "frac_reward_zero_std": 0.0, "grad_norm": 9.243149757385254, "kl": 0.0303302314132452, "learning_rate": 4.986666666666667e-06, "loss": 0.0544, "num_tokens": 319319.0, "reward": 3.375, "reward_std": 1.25, "rewards/reward_combined/mean": 3.375, "rewards/reward_combined/std": 1.25, "step": 1025 }, { "clip_ratio/high_max": 0.011111111380159855, "clip_ratio/high_mean": 0.011111111380159855, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.011111111380159855, "completion_length": 20.75, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 20.75, "completions/mean_terminated_length": 20.75, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 19.0, "frac_reward_zero_std": 0.0, "grad_norm": 5.111617088317871, "kl": 0.06491522863507271, "learning_rate": 4.986111111111112e-06, "loss": 0.0277, "num_tokens": 319650.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 1026 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 9.5, "completions/mean_terminated_length": 9.5, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 19.01851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.019914759323000908, "kl": 0.0013388039951678365, "learning_rate": 4.985555555555555e-06, "loss": 0.0001, "num_tokens": 319908.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1027 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.5, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 19.5, "completions/mean_terminated_length": 19.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 19.037037037037038, "frac_reward_zero_std": 1.0, "grad_norm": 0.29827284812927246, "kl": 0.04330900311470032, "learning_rate": 4.9850000000000006e-06, "loss": 0.0022, "num_tokens": 320206.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1028 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 19.055555555555557, "frac_reward_zero_std": 1.0, "grad_norm": 0.17163234949111938, "kl": 0.038485679775476456, "learning_rate": 4.984444444444445e-06, "loss": 0.0019, "num_tokens": 320474.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1029 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 19.074074074074073, "frac_reward_zero_std": 1.0, "grad_norm": 0.02495182305574417, "kl": 0.0007930733263492584, "learning_rate": 4.983888888888889e-06, "loss": 0.0, "num_tokens": 320786.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 1030 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 19.09259259259259, "frac_reward_zero_std": 1.0, "grad_norm": 0.7692543864250183, "kl": 0.14451509714126587, "learning_rate": 4.983333333333334e-06, "loss": 0.0066, "num_tokens": 321052.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1031 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.0, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 24.0, "completions/mean_terminated_length": 24.0, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 19.11111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.0026595506351441145, "kl": 0.09061753004789352, "learning_rate": 4.982777777777778e-06, "loss": 0.0045, "num_tokens": 321416.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 1032 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 19.12962962962963, "frac_reward_zero_std": 0.0, "grad_norm": 5.829308986663818, "kl": 0.00965787610039115, "learning_rate": 4.982222222222222e-06, "loss": 0.0036, "num_tokens": 321708.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 1033 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.5, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 22.5, "completions/mean_terminated_length": 22.5, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 19.14814814814815, "frac_reward_zero_std": 0.0, "grad_norm": 3.3320717811584473, "kl": 0.03490889072418213, "learning_rate": 4.981666666666667e-06, "loss": 0.0219, "num_tokens": 322026.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 1034 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 19.166666666666668, "frac_reward_zero_std": 1.0, "grad_norm": 0.00951455906033516, "kl": 0.019267345778644085, "learning_rate": 4.981111111111112e-06, "loss": 0.001, "num_tokens": 322298.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1035 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 19.185185185185187, "frac_reward_zero_std": 1.0, "grad_norm": 0.02855200320482254, "kl": 0.0015878296690061688, "learning_rate": 4.980555555555555e-06, "loss": 0.0001, "num_tokens": 322558.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1036 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 27.75, "completions/clipped_ratio": 0.0, "completions/max_length": 35.0, "completions/max_terminated_length": 35.0, "completions/mean_length": 27.75, "completions/mean_terminated_length": 27.75, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 19.203703703703702, "frac_reward_zero_std": 0.0, "grad_norm": 3.0160539150238037, "kl": 0.17905644327402115, "learning_rate": 4.980000000000001e-06, "loss": 0.1696, "num_tokens": 322897.0, "reward": 2.25, "reward_std": 3.5, "rewards/reward_combined/mean": 2.25, "rewards/reward_combined/std": 3.5, "step": 1037 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.75, "completions/clipped_ratio": 0.0, "completions/max_length": 32.0, "completions/max_terminated_length": 32.0, "completions/mean_length": 23.75, "completions/mean_terminated_length": 23.75, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 19.22222222222222, "frac_reward_zero_std": 1.0, "grad_norm": 0.07231759279966354, "kl": 0.028296099975705147, "learning_rate": 4.979444444444445e-06, "loss": 0.0014, "num_tokens": 323240.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1038 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 18.0, "completions/mean_terminated_length": 18.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 19.24074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.005990032106637955, "kl": 0.0004362844629213214, "learning_rate": 4.978888888888889e-06, "loss": 0.0, "num_tokens": 323552.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1039 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 19.25925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.00014002079842612147, "kl": 8.158385753631592e-06, "learning_rate": 4.978333333333334e-06, "loss": 0.0, "num_tokens": 323772.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 1040 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 19.27777777777778, "frac_reward_zero_std": 1.0, "grad_norm": 0.015529859811067581, "kl": 0.0005582794547080994, "learning_rate": 4.977777777777778e-06, "loss": 0.0, "num_tokens": 323984.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 1041 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.25, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 8.25, "completions/mean_terminated_length": 8.25, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 19.296296296296298, "frac_reward_zero_std": 0.0, "grad_norm": 5.643756866455078, "kl": 0.01535914558917284, "learning_rate": 4.977222222222222e-06, "loss": -0.0296, "num_tokens": 324229.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 1042 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 19.314814814814813, "frac_reward_zero_std": 1.0, "grad_norm": 0.001985270297154784, "kl": 0.00015801439440110698, "learning_rate": 4.976666666666667e-06, "loss": 0.0, "num_tokens": 324501.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1043 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 7.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 7.0, "completions/mean_terminated_length": 7.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 19.333333333333332, "frac_reward_zero_std": 1.0, "grad_norm": 0.28642141819000244, "kl": 0.026152964681386948, "learning_rate": 4.976111111111112e-06, "loss": 0.0021, "num_tokens": 324749.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 1044 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 19.35185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.2595086991786957, "kl": 0.059815628454089165, "learning_rate": 4.9755555555555554e-06, "loss": 0.003, "num_tokens": 325045.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1045 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.0, "completions/clipped_ratio": 0.0, "completions/max_length": 27.0, "completions/max_terminated_length": 27.0, "completions/mean_length": 22.0, "completions/mean_terminated_length": 22.0, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 19.37037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.17034408450126648, "kl": 0.08665663003921509, "learning_rate": 4.975000000000001e-06, "loss": 0.0043, "num_tokens": 325353.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1046 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.25, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 16.25, "completions/mean_terminated_length": 16.25, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 19.38888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 1.0950713157653809, "kl": 0.09302739799022675, "learning_rate": 4.974444444444445e-06, "loss": 0.005, "num_tokens": 325678.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1047 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 30.5, "completions/clipped_ratio": 0.0, "completions/max_length": 36.0, "completions/max_terminated_length": 36.0, "completions/mean_length": 30.5, "completions/mean_terminated_length": 30.5, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 19.40740740740741, "frac_reward_zero_std": 0.0, "grad_norm": 4.023906707763672, "kl": 0.18036355823278427, "learning_rate": 4.973888888888889e-06, "loss": 0.0102, "num_tokens": 326024.0, "reward": 3.5, "reward_std": 2.915475845336914, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 2.915475845336914, "step": 1048 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 5.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 5.0, "completions/mean_terminated_length": 5.0, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 19.425925925925927, "frac_reward_zero_std": 1.0, "grad_norm": 0.008403944782912731, "kl": 0.0017249762895517051, "learning_rate": 4.973333333333334e-06, "loss": 0.0001, "num_tokens": 326244.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 1049 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.01315789483487606, "clip_ratio/low_min": 0.01315789483487606, "clip_ratio/region_mean": 0.01315789483487606, "completion_length": 20.25, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 20.25, "completions/mean_terminated_length": 20.25, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 19.444444444444443, "frac_reward_zero_std": 0.0, "grad_norm": 3.4083523750305176, "kl": 0.13405900821089745, "learning_rate": 4.972777777777778e-06, "loss": 0.0609, "num_tokens": 326553.0, "reward": 4.625, "reward_std": 2.25, "rewards/reward_combined/mean": 4.625, "rewards/reward_combined/std": 2.25, "step": 1050 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 19.462962962962962, "frac_reward_zero_std": 1.0, "grad_norm": 0.0027697491459548473, "kl": 0.018207374028861523, "learning_rate": 4.9722222222222224e-06, "loss": 0.0009, "num_tokens": 326813.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1051 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.5, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 4.5, "completions/mean_terminated_length": 4.5, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 19.48148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.7913763523101807, "kl": 0.16193245351314545, "learning_rate": 4.971666666666667e-06, "loss": 0.0081, "num_tokens": 327031.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 1052 }, { "clip_ratio/high_max": 0.017241379246115685, "clip_ratio/high_mean": 0.017241379246115685, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.017241379246115685, "completion_length": 14.25, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.25, "completions/mean_terminated_length": 14.25, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 19.5, "frac_reward_zero_std": 0.0, "grad_norm": 1.637304425239563, "kl": 0.014476389857009053, "learning_rate": 4.971111111111111e-06, "loss": 0.0008, "num_tokens": 327320.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 1053 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 19.51851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.011765574105083942, "kl": 0.0008499188697896898, "learning_rate": 4.9705555555555555e-06, "loss": 0.0, "num_tokens": 327616.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1054 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.5, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 21.5, "completions/mean_terminated_length": 21.5, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 19.537037037037038, "frac_reward_zero_std": 0.0, "grad_norm": 1.6323411464691162, "kl": 0.04703832045197487, "learning_rate": 4.970000000000001e-06, "loss": 0.0278, "num_tokens": 327954.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 1055 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 19.555555555555557, "frac_reward_zero_std": 1.0, "grad_norm": 0.046459030359983444, "kl": 0.0038498379290103912, "learning_rate": 4.969444444444445e-06, "loss": 0.0002, "num_tokens": 328214.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 1056 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.75, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.75, "completions/mean_terminated_length": 6.75, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 19.574074074074073, "frac_reward_zero_std": 1.0, "grad_norm": 0.023347364738583565, "kl": 0.0035599364200606942, "learning_rate": 4.968888888888889e-06, "loss": 0.0002, "num_tokens": 328449.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 1057 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.5, "completions/clipped_ratio": 0.0, "completions/max_length": 28.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 22.5, "completions/mean_terminated_length": 22.5, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 19.59259259259259, "frac_reward_zero_std": 1.0, "grad_norm": 1.2462600469589233, "kl": 0.19178365916013718, "learning_rate": 4.968333333333334e-06, "loss": 0.0092, "num_tokens": 328763.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1058 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 19.61111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.2487753927707672, "kl": 0.01910197443794459, "learning_rate": 4.967777777777778e-06, "loss": 0.001, "num_tokens": 329043.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1059 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.25, "completions/clipped_ratio": 0.0, "completions/max_length": 33.0, "completions/max_terminated_length": 33.0, "completions/mean_length": 21.25, "completions/mean_terminated_length": 21.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 19.62962962962963, "frac_reward_zero_std": 0.0, "grad_norm": 3.548255205154419, "kl": 0.1681913062930107, "learning_rate": 4.9672222222222225e-06, "loss": 0.0154, "num_tokens": 329368.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 1060 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.5, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 17.5, "completions/mean_terminated_length": 17.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 19.64814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.16127163171768188, "kl": 0.03289625234901905, "learning_rate": 4.966666666666667e-06, "loss": 0.0017, "num_tokens": 329666.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1061 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.25, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 17.25, "completions/mean_terminated_length": 17.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 19.666666666666668, "frac_reward_zero_std": 0.0, "grad_norm": 3.924082040786743, "kl": 0.05322139197960496, "learning_rate": 4.966111111111111e-06, "loss": 0.0176, "num_tokens": 329975.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 1062 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 19.685185185185187, "frac_reward_zero_std": 1.0, "grad_norm": 0.02751401625573635, "kl": 0.007852447219192982, "learning_rate": 4.9655555555555555e-06, "loss": 0.0004, "num_tokens": 330247.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1063 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 19.703703703703702, "frac_reward_zero_std": 1.0, "grad_norm": 0.04211265966296196, "kl": 0.0007727682241238654, "learning_rate": 4.965000000000001e-06, "loss": 0.0, "num_tokens": 330503.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 1064 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.75, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 16.75, "completions/mean_terminated_length": 16.75, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 19.72222222222222, "frac_reward_zero_std": 0.0, "grad_norm": 3.3717548847198486, "kl": 0.010665492620319128, "learning_rate": 4.964444444444445e-06, "loss": 0.07, "num_tokens": 330794.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 1065 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 19.74074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.015352129936218262, "kl": 0.0011142075527459383, "learning_rate": 4.9638888888888895e-06, "loss": 0.0001, "num_tokens": 331054.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 1066 }, { "clip_ratio/high_max": 0.011904762126505375, "clip_ratio/high_mean": 0.011904762126505375, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.011904762126505375, "completion_length": 19.5, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 19.5, "completions/mean_terminated_length": 19.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 19.75925925925926, "frac_reward_zero_std": 0.0, "grad_norm": 3.598543167114258, "kl": 0.14879655092954636, "learning_rate": 4.963333333333334e-06, "loss": -0.0255, "num_tokens": 331384.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 1067 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.25, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.25, "completions/mean_terminated_length": 14.25, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 19.77777777777778, "frac_reward_zero_std": 0.0, "grad_norm": 5.684422492980957, "kl": 0.09897159226238728, "learning_rate": 4.962777777777778e-06, "loss": -0.0063, "num_tokens": 331657.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 1068 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.25, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 12.25, "completions/mean_terminated_length": 12.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 19.796296296296298, "frac_reward_zero_std": 1.0, "grad_norm": 0.0721936821937561, "kl": 0.0073605983052402735, "learning_rate": 4.9622222222222225e-06, "loss": 0.0004, "num_tokens": 331938.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1069 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 31.25, "completions/clipped_ratio": 0.0, "completions/max_length": 37.0, "completions/max_terminated_length": 37.0, "completions/mean_length": 31.25, "completions/mean_terminated_length": 31.25, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 19.814814814814813, "frac_reward_zero_std": 1.0, "grad_norm": 0.18793879449367523, "kl": 0.05542546883225441, "learning_rate": 4.961666666666667e-06, "loss": 0.0027, "num_tokens": 332279.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 1070 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.25, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 17.25, "completions/mean_terminated_length": 17.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 19.833333333333332, "frac_reward_zero_std": 1.0, "grad_norm": 0.09136975556612015, "kl": 0.007158362306654453, "learning_rate": 4.961111111111111e-06, "loss": 0.0004, "num_tokens": 332588.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_combined/mean": 1.0, "rewards/reward_combined/std": 0.0, "step": 1071 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 42.5, "completions/clipped_ratio": 0.0, "completions/max_length": 43.0, "completions/max_terminated_length": 43.0, "completions/mean_length": 42.5, "completions/mean_terminated_length": 42.5, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "epoch": 19.85185185185185, "frac_reward_zero_std": 0.0, "grad_norm": 1.7718629837036133, "kl": 0.05134452320635319, "learning_rate": 4.9605555555555564e-06, "loss": 0.0033, "num_tokens": 333038.0, "reward": 2.375, "reward_std": 1.25, "rewards/reward_combined/mean": 2.375, "rewards/reward_combined/std": 1.25, "step": 1072 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.5, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 2.5, "completions/mean_terminated_length": 2.5, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 19.87037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.06397955119609833, "kl": 0.0015552788972854614, "learning_rate": 4.960000000000001e-06, "loss": 0.0001, "num_tokens": 333244.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 1073 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 19.88888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.45050206780433655, "kl": 0.04385194182395935, "learning_rate": 4.959444444444445e-06, "loss": 0.0022, "num_tokens": 333456.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 1074 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.0, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 20.0, "completions/mean_terminated_length": 20.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 19.90740740740741, "frac_reward_zero_std": 0.0, "grad_norm": 7.429428577423096, "kl": 0.08960303664207458, "learning_rate": 4.9588888888888895e-06, "loss": 0.0474, "num_tokens": 333756.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 1075 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.5, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 23.5, "completions/mean_terminated_length": 23.5, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 19.925925925925927, "frac_reward_zero_std": 0.0, "grad_norm": 3.326880693435669, "kl": 0.01371912844479084, "learning_rate": 4.958333333333334e-06, "loss": -0.0239, "num_tokens": 334094.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 1076 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.75, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 13.75, "completions/mean_terminated_length": 13.75, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 19.944444444444443, "frac_reward_zero_std": 1.0, "grad_norm": 0.046631716191768646, "kl": 0.005625084915664047, "learning_rate": 4.957777777777778e-06, "loss": 0.0003, "num_tokens": 334373.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1077 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 36.75, "completions/clipped_ratio": 0.0, "completions/max_length": 78.0, "completions/max_terminated_length": 78.0, "completions/mean_length": 36.75, "completions/mean_terminated_length": 36.75, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 19.962962962962962, "frac_reward_zero_std": 0.0, "grad_norm": 4.850725173950195, "kl": 0.03989560902118683, "learning_rate": 4.9572222222222226e-06, "loss": -0.1656, "num_tokens": 334756.0, "reward": 4.125, "reward_std": 4.308422088623047, "rewards/reward_combined/mean": 4.125, "rewards/reward_combined/std": 4.308422088623047, "step": 1078 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.25, "completions/clipped_ratio": 0.0, "completions/max_length": 25.0, "completions/max_terminated_length": 25.0, "completions/mean_length": 22.25, "completions/mean_terminated_length": 22.25, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 19.98148148148148, "frac_reward_zero_std": 0.0, "grad_norm": 6.298603534698486, "kl": 0.04090710170567036, "learning_rate": 4.956666666666667e-06, "loss": 0.0035, "num_tokens": 335093.0, "reward": 2.5, "reward_std": 1.7320507764816284, "rewards/reward_combined/mean": 2.5, "rewards/reward_combined/std": 1.7320507764816284, "step": 1079 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 20.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.33509376645088196, "kl": 0.12259339541196823, "learning_rate": 4.956111111111111e-06, "loss": 0.0067, "num_tokens": 335359.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1080 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 38.25, "completions/clipped_ratio": 0.0, "completions/max_length": 45.0, "completions/max_terminated_length": 45.0, "completions/mean_length": 38.25, "completions/mean_terminated_length": 38.25, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 20.01851851851852, "frac_reward_zero_std": 0.0, "grad_norm": 3.3824381828308105, "kl": 0.08046378381550312, "learning_rate": 4.9555555555555565e-06, "loss": 0.0887, "num_tokens": 335728.0, "reward": 2.25, "reward_std": 1.443375587463379, "rewards/reward_combined/mean": 2.25, "rewards/reward_combined/std": 1.4433757066726685, "step": 1081 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 12.5, "completions/mean_terminated_length": 12.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 20.037037037037038, "frac_reward_zero_std": 1.0, "grad_norm": 0.08070409297943115, "kl": 0.008534100372344255, "learning_rate": 4.955e-06, "loss": 0.0004, "num_tokens": 336010.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1082 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.75, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 14.75, "completions/mean_terminated_length": 14.75, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 20.055555555555557, "frac_reward_zero_std": 0.0, "grad_norm": 10.789376258850098, "kl": 0.11606713570654392, "learning_rate": 4.954444444444445e-06, "loss": 0.1136, "num_tokens": 336289.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 1083 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.5, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 20.5, "completions/mean_terminated_length": 20.5, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 20.074074074074073, "frac_reward_zero_std": 1.0, "grad_norm": 0.32052522897720337, "kl": 0.0944342128932476, "learning_rate": 4.9538888888888896e-06, "loss": 0.0047, "num_tokens": 336595.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1084 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 20.09259259259259, "frac_reward_zero_std": 1.0, "grad_norm": 0.0026931907050311565, "kl": 0.018229997716844082, "learning_rate": 4.953333333333334e-06, "loss": 0.0009, "num_tokens": 336855.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1085 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 20.11111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.5638939142227173, "kl": 0.07382907066494226, "learning_rate": 4.952777777777778e-06, "loss": 0.0031, "num_tokens": 337145.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1086 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.5, "completions/clipped_ratio": 0.0, "completions/max_length": 29.0, "completions/max_terminated_length": 29.0, "completions/mean_length": 21.5, "completions/mean_terminated_length": 21.5, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 20.12962962962963, "frac_reward_zero_std": 0.0, "grad_norm": 4.801694393157959, "kl": 0.11115998029708862, "learning_rate": 4.952222222222223e-06, "loss": 0.1042, "num_tokens": 337459.0, "reward": 4.0, "reward_std": 4.527692794799805, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 4.527692794799805, "step": 1087 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 20.14814814814815, "frac_reward_zero_std": 0.0, "grad_norm": 3.9187347888946533, "kl": 0.000533264857949689, "learning_rate": 4.951666666666667e-06, "loss": -0.003, "num_tokens": 337715.0, "reward": 2.875, "reward_std": 1.25, "rewards/reward_combined/mean": 2.875, "rewards/reward_combined/std": 1.25, "step": 1088 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 20.166666666666668, "frac_reward_zero_std": 1.0, "grad_norm": 0.03167855367064476, "kl": 0.0019123487873002887, "learning_rate": 4.951111111111111e-06, "loss": 0.0001, "num_tokens": 337975.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1089 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.0, "completions/clipped_ratio": 0.0, "completions/max_length": 31.0, "completions/max_terminated_length": 31.0, "completions/mean_length": 25.0, "completions/mean_terminated_length": 25.0, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 20.185185185185187, "frac_reward_zero_std": 0.0, "grad_norm": 6.577125072479248, "kl": 0.12872080504894257, "learning_rate": 4.9505555555555565e-06, "loss": 0.0633, "num_tokens": 338323.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 1090 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 28.25, "completions/clipped_ratio": 0.0, "completions/max_length": 39.0, "completions/max_terminated_length": 39.0, "completions/mean_length": 28.25, "completions/mean_terminated_length": 28.25, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 20.203703703703702, "frac_reward_zero_std": 0.0, "grad_norm": 4.030978202819824, "kl": 0.25382937490940094, "learning_rate": 4.95e-06, "loss": -0.128, "num_tokens": 338660.0, "reward": 4.625, "reward_std": 3.4731109142303467, "rewards/reward_combined/mean": 4.625, "rewards/reward_combined/std": 3.4731109142303467, "step": 1091 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 6.0, "completions/mean_terminated_length": 6.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 20.22222222222222, "frac_reward_zero_std": 1.0, "grad_norm": 0.20953090488910675, "kl": 0.01888192445039749, "learning_rate": 4.949444444444445e-06, "loss": 0.0014, "num_tokens": 338904.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 1092 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.25, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 20.25, "completions/mean_terminated_length": 20.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 20.24074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.37362751364707947, "kl": 0.1546972543001175, "learning_rate": 4.94888888888889e-06, "loss": 0.008, "num_tokens": 339237.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1093 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.25, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 19.25, "completions/mean_terminated_length": 19.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 20.25925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.060383811593055725, "kl": 0.10714878886938095, "learning_rate": 4.948333333333334e-06, "loss": 0.0052, "num_tokens": 339554.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1094 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.0, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 19.0, "completions/mean_terminated_length": 19.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 20.27777777777778, "frac_reward_zero_std": 1.0, "grad_norm": 0.7018697261810303, "kl": 0.11542755737900734, "learning_rate": 4.947777777777778e-06, "loss": 0.0055, "num_tokens": 339850.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 1095 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 20.296296296296298, "frac_reward_zero_std": 1.0, "grad_norm": 0.009076755493879318, "kl": 0.004525758326053619, "learning_rate": 4.947222222222223e-06, "loss": 0.0002, "num_tokens": 340094.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 1096 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 20.314814814814813, "frac_reward_zero_std": 1.0, "grad_norm": 0.07230979949235916, "kl": 0.006110168760642409, "learning_rate": 4.946666666666667e-06, "loss": 0.0003, "num_tokens": 340418.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1097 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.75, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 11.75, "completions/mean_terminated_length": 11.75, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 20.333333333333332, "frac_reward_zero_std": 1.0, "grad_norm": 0.13752692937850952, "kl": 0.012539931572973728, "learning_rate": 4.946111111111111e-06, "loss": 0.0006, "num_tokens": 340689.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1098 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 20.35185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.03708859533071518, "kl": 0.0013888776302337646, "learning_rate": 4.945555555555557e-06, "loss": 0.0001, "num_tokens": 340897.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 1099 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.75, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 4.75, "completions/mean_terminated_length": 4.75, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 20.37037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.07807718217372894, "kl": 0.0035019456408917904, "learning_rate": 4.945e-06, "loss": 0.0002, "num_tokens": 341116.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 1100 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.25, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 18.25, "completions/mean_terminated_length": 18.25, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 20.38888888888889, "frac_reward_zero_std": 0.0, "grad_norm": 7.427909851074219, "kl": 0.6800234615802765, "learning_rate": 4.944444444444445e-06, "loss": 0.0319, "num_tokens": 341409.0, "reward": 4.0, "reward_std": 4.041451930999756, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 4.041451930999756, "step": 1101 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 20.40740740740741, "frac_reward_zero_std": 1.0, "grad_norm": 0.3330053985118866, "kl": 0.028791368764359504, "learning_rate": 4.943888888888889e-06, "loss": 0.0014, "num_tokens": 341669.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 1102 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.25, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 20.25, "completions/mean_terminated_length": 20.25, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 20.425925925925927, "frac_reward_zero_std": 0.0, "grad_norm": 4.22399377822876, "kl": 0.19743453711271286, "learning_rate": 4.943333333333334e-06, "loss": -0.0034, "num_tokens": 341978.0, "reward": 5.75, "reward_std": 2.598076105117798, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 2.598076105117798, "step": 1103 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 26.5, "completions/clipped_ratio": 0.0, "completions/max_length": 40.0, "completions/max_terminated_length": 40.0, "completions/mean_length": 26.5, "completions/mean_terminated_length": 26.5, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 20.444444444444443, "frac_reward_zero_std": 1.0, "grad_norm": 0.13458384573459625, "kl": 0.0856349803507328, "learning_rate": 4.942777777777778e-06, "loss": 0.0039, "num_tokens": 342304.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1104 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 10.5, "completions/mean_terminated_length": 10.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 20.462962962962962, "frac_reward_zero_std": 0.0, "grad_norm": 5.3271050453186035, "kl": 0.008626417722553015, "learning_rate": 4.942222222222223e-06, "loss": 0.0453, "num_tokens": 342570.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 1105 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 20.48148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.10012917220592499, "kl": 0.0679309219121933, "learning_rate": 4.941666666666667e-06, "loss": 0.0035, "num_tokens": 342842.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1106 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 20.5, "frac_reward_zero_std": 1.0, "grad_norm": 0.08136898279190063, "kl": 0.005604652455076575, "learning_rate": 4.9411111111111114e-06, "loss": 0.0003, "num_tokens": 343140.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1107 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.5, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 3.5, "completions/mean_terminated_length": 3.5, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 20.51851851851852, "frac_reward_zero_std": 0.0, "grad_norm": 17.34052085876465, "kl": 0.007978130131959915, "learning_rate": 4.940555555555557e-06, "loss": 0.2911, "num_tokens": 343358.0, "reward": 2.75, "reward_std": 1.5, "rewards/reward_combined/mean": 2.75, "rewards/reward_combined/std": 1.5, "step": 1108 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 17.0, "completions/mean_terminated_length": 17.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 20.537037037037038, "frac_reward_zero_std": 1.0, "grad_norm": 0.0888671800494194, "kl": 0.021556190215051174, "learning_rate": 4.94e-06, "loss": 0.0011, "num_tokens": 343666.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1109 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.0, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 24.0, "completions/mean_terminated_length": 24.0, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 20.555555555555557, "frac_reward_zero_std": 1.0, "grad_norm": 0.0029747583903372288, "kl": 0.09054818004369736, "learning_rate": 4.939444444444445e-06, "loss": 0.0045, "num_tokens": 344030.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 1110 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.75, "completions/clipped_ratio": 0.0, "completions/max_length": 37.0, "completions/max_terminated_length": 37.0, "completions/mean_length": 16.75, "completions/mean_terminated_length": 16.75, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 20.574074074074073, "frac_reward_zero_std": 1.0, "grad_norm": 1.77183198928833, "kl": 0.09302114136517048, "learning_rate": 4.938888888888889e-06, "loss": 0.007, "num_tokens": 344317.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1111 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.25, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.25, "completions/mean_terminated_length": 11.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 20.59259259259259, "frac_reward_zero_std": 1.0, "grad_norm": 0.03585350513458252, "kl": 0.0055355902295559645, "learning_rate": 4.938333333333334e-06, "loss": 0.0003, "num_tokens": 344590.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1112 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 20.61111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.04017645865678787, "kl": 0.020007004961371422, "learning_rate": 4.937777777777778e-06, "loss": 0.001, "num_tokens": 344878.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1113 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.01315789483487606, "clip_ratio/low_min": 0.01315789483487606, "clip_ratio/region_mean": 0.01315789483487606, "completion_length": 20.0, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 20.0, "completions/mean_terminated_length": 20.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 20.62962962962963, "frac_reward_zero_std": 0.0, "grad_norm": 4.326385021209717, "kl": 0.07349661365151405, "learning_rate": 4.937222222222223e-06, "loss": -0.0266, "num_tokens": 345210.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 1114 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.75, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 4.75, "completions/mean_terminated_length": 4.75, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 20.64814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.9453924298286438, "kl": 0.08318531513214111, "learning_rate": 4.936666666666667e-06, "loss": 0.0048, "num_tokens": 345429.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 1115 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 20.666666666666668, "frac_reward_zero_std": 1.0, "grad_norm": 0.0014421507949009538, "kl": 6.59748911857605e-05, "learning_rate": 4.9361111111111115e-06, "loss": 0.0, "num_tokens": 345649.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 1116 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.25, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 18.25, "completions/mean_terminated_length": 18.25, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 20.685185185185187, "frac_reward_zero_std": 1.0, "grad_norm": 0.10473082959651947, "kl": 0.018203971907496452, "learning_rate": 4.935555555555556e-06, "loss": 0.0009, "num_tokens": 345950.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1117 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 20.703703703703702, "frac_reward_zero_std": 1.0, "grad_norm": 0.14779256284236908, "kl": 0.0390373095870018, "learning_rate": 4.935e-06, "loss": 0.002, "num_tokens": 346218.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1118 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.75, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.75, "completions/mean_terminated_length": 6.75, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 20.72222222222222, "frac_reward_zero_std": 1.0, "grad_norm": 0.03684819117188454, "kl": 0.004257744061760604, "learning_rate": 4.934444444444445e-06, "loss": 0.0002, "num_tokens": 346453.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 1119 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.5, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 9.5, "completions/mean_terminated_length": 9.5, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 20.74074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.02492555044591427, "kl": 0.0013734165695495903, "learning_rate": 4.933888888888889e-06, "loss": 0.0001, "num_tokens": 346711.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1120 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 16.5, "completions/mean_terminated_length": 16.5, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 20.75925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.0980101153254509, "kl": 0.007050741463899612, "learning_rate": 4.933333333333334e-06, "loss": 0.0004, "num_tokens": 347017.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_combined/mean": 1.0, "rewards/reward_combined/std": 0.0, "step": 1121 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.75, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 20.75, "completions/mean_terminated_length": 20.75, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 20.77777777777778, "frac_reward_zero_std": 1.0, "grad_norm": 0.1562749147415161, "kl": 0.033647555857896805, "learning_rate": 4.932777777777778e-06, "loss": 0.0017, "num_tokens": 347348.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1122 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.0, "completions/clipped_ratio": 0.0, "completions/max_length": 38.0, "completions/max_terminated_length": 38.0, "completions/mean_length": 23.0, "completions/mean_terminated_length": 23.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 20.796296296296298, "frac_reward_zero_std": 0.0, "grad_norm": 2.308817148208618, "kl": 0.00440683588385582, "learning_rate": 4.932222222222223e-06, "loss": 0.1814, "num_tokens": 347680.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 1123 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.25, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 22.25, "completions/mean_terminated_length": 22.25, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 20.814814814814813, "frac_reward_zero_std": 1.0, "grad_norm": 0.12872835993766785, "kl": 0.014510346110910177, "learning_rate": 4.931666666666667e-06, "loss": 0.0007, "num_tokens": 348013.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1124 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 41.0, "completions/clipped_ratio": 0.0, "completions/max_length": 43.0, "completions/max_terminated_length": 43.0, "completions/mean_length": 41.0, "completions/mean_terminated_length": 41.0, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 20.833333333333332, "frac_reward_zero_std": 1.0, "grad_norm": 0.24307742714881897, "kl": 0.08615827187895775, "learning_rate": 4.9311111111111115e-06, "loss": 0.0041, "num_tokens": 348457.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 1125 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 20.85185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.05965739116072655, "kl": 0.0042120045982301235, "learning_rate": 4.930555555555556e-06, "loss": 0.0002, "num_tokens": 348721.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1126 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 20.87037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.06732179969549179, "kl": 0.005646795034408569, "learning_rate": 4.93e-06, "loss": 0.0003, "num_tokens": 348981.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 1127 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 20.88888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.02519744075834751, "kl": 0.00043748319149017334, "learning_rate": 4.9294444444444454e-06, "loss": 0.0, "num_tokens": 349193.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 1128 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.0, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 22.0, "completions/mean_terminated_length": 22.0, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 20.90740740740741, "frac_reward_zero_std": 1.0, "grad_norm": 0.014725363813340664, "kl": 0.017609018832445145, "learning_rate": 4.928888888888889e-06, "loss": 0.0009, "num_tokens": 349509.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1129 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 20.925925925925927, "frac_reward_zero_std": 1.0, "grad_norm": 0.041261255741119385, "kl": 0.02831559907644987, "learning_rate": 4.928333333333334e-06, "loss": 0.0014, "num_tokens": 349795.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1130 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 20.944444444444443, "frac_reward_zero_std": 1.0, "grad_norm": 0.1884632408618927, "kl": 0.027718988247215748, "learning_rate": 4.927777777777778e-06, "loss": 0.0014, "num_tokens": 350071.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1131 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 34.5, "completions/clipped_ratio": 0.0, "completions/max_length": 49.0, "completions/max_terminated_length": 49.0, "completions/mean_length": 34.5, "completions/mean_terminated_length": 34.5, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 20.962962962962962, "frac_reward_zero_std": 0.0, "grad_norm": 3.9383654594421387, "kl": 0.05770757794380188, "learning_rate": 4.927222222222223e-06, "loss": -0.051, "num_tokens": 350445.0, "reward": 2.125, "reward_std": 2.428133726119995, "rewards/reward_combined/mean": 2.125, "rewards/reward_combined/std": 2.428133726119995, "step": 1132 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 20.98148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.050584424287080765, "kl": 0.01652698963880539, "learning_rate": 4.926666666666667e-06, "loss": 0.0008, "num_tokens": 350757.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 1133 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 21.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.2519683837890625, "kl": 0.04469682276248932, "learning_rate": 4.9261111111111116e-06, "loss": 0.0022, "num_tokens": 351049.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1134 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.25, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 19.25, "completions/mean_terminated_length": 19.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 21.01851851851852, "frac_reward_zero_std": 0.0, "grad_norm": 6.6765546798706055, "kl": 0.05569342523813248, "learning_rate": 4.925555555555556e-06, "loss": -0.0622, "num_tokens": 351350.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 1135 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.25, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 16.25, "completions/mean_terminated_length": 16.25, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 21.037037037037038, "frac_reward_zero_std": 1.0, "grad_norm": 0.7561283707618713, "kl": 0.09551490470767021, "learning_rate": 4.925e-06, "loss": 0.0051, "num_tokens": 351639.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1136 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.5, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 17.5, "completions/mean_terminated_length": 17.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 21.055555555555557, "frac_reward_zero_std": 1.0, "grad_norm": 0.07074621319770813, "kl": 0.01075678039342165, "learning_rate": 4.924444444444445e-06, "loss": 0.0005, "num_tokens": 351937.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1137 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 16.5, "completions/mean_terminated_length": 16.5, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 21.074074074074073, "frac_reward_zero_std": 1.0, "grad_norm": 0.07590477168560028, "kl": 0.005401508184149861, "learning_rate": 4.923888888888889e-06, "loss": 0.0003, "num_tokens": 352243.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_combined/mean": 1.0, "rewards/reward_combined/std": 0.0, "step": 1138 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 21.09259259259259, "frac_reward_zero_std": 1.0, "grad_norm": 1.800001859664917, "kl": 0.11076255142688751, "learning_rate": 4.923333333333334e-06, "loss": 0.0082, "num_tokens": 352459.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 1139 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 21.11111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.010570141486823559, "kl": 0.004061475396156311, "learning_rate": 4.922777777777778e-06, "loss": 0.0002, "num_tokens": 352703.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 1140 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 21.12962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.0035929863806813955, "kl": 0.018013957887887955, "learning_rate": 4.922222222222223e-06, "loss": 0.0009, "num_tokens": 352963.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1141 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 21.14814814814815, "frac_reward_zero_std": 0.0, "grad_norm": 6.119032859802246, "kl": 0.005681137088686228, "learning_rate": 4.921666666666666e-06, "loss": -0.0797, "num_tokens": 353239.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 1142 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.0, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 17.0, "completions/mean_terminated_length": 17.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 21.166666666666668, "frac_reward_zero_std": 1.0, "grad_norm": 0.06629530340433121, "kl": 0.06451274827122688, "learning_rate": 4.921111111111112e-06, "loss": 0.0032, "num_tokens": 353535.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 1143 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.5, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 3.5, "completions/mean_terminated_length": 3.5, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 21.185185185185187, "frac_reward_zero_std": 1.0, "grad_norm": 0.10412687808275223, "kl": 0.0054829916916787624, "learning_rate": 4.920555555555556e-06, "loss": 0.0003, "num_tokens": 353745.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 1144 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.75, "completions/clipped_ratio": 0.0, "completions/max_length": 25.0, "completions/max_terminated_length": 25.0, "completions/mean_length": 19.75, "completions/mean_terminated_length": 19.75, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 21.203703703703702, "frac_reward_zero_std": 0.0, "grad_norm": 5.207913875579834, "kl": 0.10544314235448837, "learning_rate": 4.92e-06, "loss": 0.0043, "num_tokens": 354044.0, "reward": 2.5, "reward_std": 1.7320507764816284, "rewards/reward_combined/mean": 2.5, "rewards/reward_combined/std": 1.7320507764816284, "step": 1145 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 21.22222222222222, "frac_reward_zero_std": 0.0, "grad_norm": 4.671443462371826, "kl": 0.05727195367217064, "learning_rate": 4.919444444444445e-06, "loss": -0.0035, "num_tokens": 354308.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 1146 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 30.0, "completions/max_terminated_length": 30.0, "completions/mean_length": 18.0, "completions/mean_terminated_length": 18.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 21.24074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.08752396702766418, "kl": 0.01681679580360651, "learning_rate": 4.918888888888889e-06, "loss": 0.0008, "num_tokens": 354612.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1147 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 5.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 5.0, "completions/mean_terminated_length": 5.0, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 21.25925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.006568490993231535, "kl": 0.0015992402913980186, "learning_rate": 4.918333333333334e-06, "loss": 0.0001, "num_tokens": 354832.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 1148 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 21.27777777777778, "frac_reward_zero_std": 1.0, "grad_norm": 0.06424691528081894, "kl": 0.0048697046004235744, "learning_rate": 4.917777777777778e-06, "loss": 0.0002, "num_tokens": 355154.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1149 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 18.0, "completions/mean_terminated_length": 18.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 21.296296296296298, "frac_reward_zero_std": 1.0, "grad_norm": 0.07289162278175354, "kl": 0.027500695548951626, "learning_rate": 4.917222222222223e-06, "loss": 0.0013, "num_tokens": 355446.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1150 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 12.5, "completions/mean_terminated_length": 12.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 21.314814814814813, "frac_reward_zero_std": 1.0, "grad_norm": 0.09166938811540604, "kl": 0.01696994062513113, "learning_rate": 4.9166666666666665e-06, "loss": 0.0009, "num_tokens": 355720.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1151 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.5, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 12.5, "completions/mean_terminated_length": 12.5, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 21.333333333333332, "frac_reward_zero_std": 1.0, "grad_norm": 0.1819019317626953, "kl": 0.022678513079881668, "learning_rate": 4.916111111111112e-06, "loss": 0.0011, "num_tokens": 355990.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1152 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.75, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.75, "completions/mean_terminated_length": 6.75, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 21.35185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.051520101726055145, "kl": 0.0040514456341043115, "learning_rate": 4.915555555555556e-06, "loss": 0.0002, "num_tokens": 356225.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 1153 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 21.37037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.02957177720963955, "kl": 0.023020675871521235, "learning_rate": 4.915e-06, "loss": 0.0012, "num_tokens": 356513.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1154 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.5, "completions/clipped_ratio": 0.0, "completions/max_length": 29.0, "completions/max_terminated_length": 29.0, "completions/mean_length": 25.5, "completions/mean_terminated_length": 25.5, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 21.38888888888889, "frac_reward_zero_std": 0.0, "grad_norm": 5.785685062408447, "kl": 0.05546746030449867, "learning_rate": 4.914444444444445e-06, "loss": 0.1092, "num_tokens": 356863.0, "reward": 2.5, "reward_std": 1.7320507764816284, "rewards/reward_combined/mean": 2.5, "rewards/reward_combined/std": 1.7320507764816284, "step": 1155 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.5, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 20.5, "completions/mean_terminated_length": 20.5, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 21.40740740740741, "frac_reward_zero_std": 1.0, "grad_norm": 0.07334727793931961, "kl": 0.02897409349679947, "learning_rate": 4.913888888888889e-06, "loss": 0.0015, "num_tokens": 357197.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1156 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 44.75, "completions/clipped_ratio": 0.0, "completions/max_length": 50.0, "completions/max_terminated_length": 50.0, "completions/mean_length": 44.75, "completions/mean_terminated_length": 44.75, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 21.425925925925927, "frac_reward_zero_std": 0.0, "grad_norm": 1.4743679761886597, "kl": 0.06898967176675797, "learning_rate": 4.9133333333333334e-06, "loss": 0.0405, "num_tokens": 357656.0, "reward": 1.7999999523162842, "reward_std": 2.3999998569488525, "rewards/reward_combined/mean": 1.7999999523162842, "rewards/reward_combined/std": 2.3999998569488525, "step": 1157 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.75, "completions/clipped_ratio": 0.0, "completions/max_length": 28.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 24.75, "completions/mean_terminated_length": 24.75, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 21.444444444444443, "frac_reward_zero_std": 0.0, "grad_norm": 6.279052734375, "kl": 0.11995658744126558, "learning_rate": 4.912777777777778e-06, "loss": 0.101, "num_tokens": 357983.0, "reward": 5.125, "reward_std": 3.4731109142303467, "rewards/reward_combined/mean": 5.125, "rewards/reward_combined/std": 3.4731109142303467, "step": 1158 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 21.462962962962962, "frac_reward_zero_std": 1.0, "grad_norm": 0.22562088072299957, "kl": 0.03192038310226053, "learning_rate": 4.912222222222223e-06, "loss": 0.0012, "num_tokens": 358255.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 1159 }, { "clip_ratio/high_max": 0.012820512987673283, "clip_ratio/high_mean": 0.012820512987673283, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.012820512987673283, "completion_length": 76.75, "completions/clipped_ratio": 0.25, "completions/max_length": 256.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 76.75, "completions/mean_terminated_length": 17.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 21.48148148148148, "frac_reward_zero_std": 0.0, "grad_norm": 2.2678613662719727, "kl": 0.0451419223099947, "learning_rate": 4.9116666666666665e-06, "loss": 0.4568, "num_tokens": 358778.0, "reward": 5.050000190734863, "reward_std": 5.899999618530273, "rewards/reward_combined/mean": 5.050000190734863, "rewards/reward_combined/std": 5.90000057220459, "step": 1160 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.0, "completions/clipped_ratio": 0.0, "completions/max_length": 29.0, "completions/max_terminated_length": 29.0, "completions/mean_length": 20.0, "completions/mean_terminated_length": 20.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 21.5, "frac_reward_zero_std": 0.0, "grad_norm": 6.635470867156982, "kl": 0.07659619115293026, "learning_rate": 4.911111111111112e-06, "loss": 0.0688, "num_tokens": 359098.0, "reward": 1.75, "reward_std": 2.1794495582580566, "rewards/reward_combined/mean": 1.75, "rewards/reward_combined/std": 2.1794495582580566, "step": 1161 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.75, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 9.75, "completions/mean_terminated_length": 9.75, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 21.51851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.08746584504842758, "kl": 0.0074495249427855015, "learning_rate": 4.910555555555556e-06, "loss": 0.0004, "num_tokens": 359365.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 1162 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 21.537037037037038, "frac_reward_zero_std": 1.0, "grad_norm": 0.14773081243038177, "kl": 0.02291015489026904, "learning_rate": 4.9100000000000004e-06, "loss": 0.0014, "num_tokens": 359637.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1163 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 21.555555555555557, "frac_reward_zero_std": 1.0, "grad_norm": 0.09686380624771118, "kl": 0.010608309414237738, "learning_rate": 4.909444444444445e-06, "loss": 0.0005, "num_tokens": 359903.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1164 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.0, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 19.0, "completions/mean_terminated_length": 19.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 21.574074074074073, "frac_reward_zero_std": 1.0, "grad_norm": 0.06692738085985184, "kl": 0.022903108969330788, "learning_rate": 4.908888888888889e-06, "loss": 0.0011, "num_tokens": 360227.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1165 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 21.59259259259259, "frac_reward_zero_std": 1.0, "grad_norm": 0.07365915179252625, "kl": 0.006969837471842766, "learning_rate": 4.9083333333333335e-06, "loss": 0.0003, "num_tokens": 360515.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1166 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.0, "completions/clipped_ratio": 0.0, "completions/max_length": 29.0, "completions/max_terminated_length": 29.0, "completions/mean_length": 24.0, "completions/mean_terminated_length": 24.0, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 21.61111111111111, "frac_reward_zero_std": 0.0, "grad_norm": 4.779480457305908, "kl": 0.1149221658706665, "learning_rate": 4.907777777777778e-06, "loss": -0.043, "num_tokens": 360839.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 1167 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.25, "completions/clipped_ratio": 0.0, "completions/max_length": 27.0, "completions/max_terminated_length": 27.0, "completions/mean_length": 21.25, "completions/mean_terminated_length": 21.25, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 21.62962962962963, "frac_reward_zero_std": 0.0, "grad_norm": 6.376820087432861, "kl": 0.0582354674115777, "learning_rate": 4.907222222222223e-06, "loss": 0.0562, "num_tokens": 361148.0, "reward": 6.75, "reward_std": 2.5, "rewards/reward_combined/mean": 6.75, "rewards/reward_combined/std": 2.5, "step": 1168 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 21.64814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.04485880210995674, "kl": 0.008734485134482384, "learning_rate": 4.9066666666666666e-06, "loss": 0.0004, "num_tokens": 361404.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 1169 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.5, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 18.5, "completions/mean_terminated_length": 18.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 21.666666666666668, "frac_reward_zero_std": 0.0, "grad_norm": 2.1887004375457764, "kl": 0.010484515223652124, "learning_rate": 4.906111111111112e-06, "loss": 0.0188, "num_tokens": 361718.0, "reward": 4.5, "reward_std": 4.041451930999756, "rewards/reward_combined/mean": 4.5, "rewards/reward_combined/std": 4.041451930999756, "step": 1170 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.25, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 9.25, "completions/mean_terminated_length": 9.25, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 21.685185185185187, "frac_reward_zero_std": 0.0, "grad_norm": 10.541540145874023, "kl": 0.06382082309573889, "learning_rate": 4.905555555555556e-06, "loss": 0.2998, "num_tokens": 361955.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 1171 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.5, "completions/clipped_ratio": 0.0, "completions/max_length": 44.0, "completions/max_terminated_length": 44.0, "completions/mean_length": 22.5, "completions/mean_terminated_length": 22.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 21.703703703703702, "frac_reward_zero_std": 0.0, "grad_norm": 2.584369659423828, "kl": 0.03026885073632002, "learning_rate": 4.9050000000000005e-06, "loss": -0.2402, "num_tokens": 362265.0, "reward": 6.0, "reward_std": 3.34165620803833, "rewards/reward_combined/mean": 6.0, "rewards/reward_combined/std": 3.34165620803833, "step": 1172 }, { "clip_ratio/high_max": 0.010204081423580647, "clip_ratio/high_mean": 0.010204081423580647, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010204081423580647, "completion_length": 31.0, "completions/clipped_ratio": 0.0, "completions/max_length": 43.0, "completions/max_terminated_length": 43.0, "completions/mean_length": 31.0, "completions/mean_terminated_length": 31.0, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 21.72222222222222, "frac_reward_zero_std": 0.0, "grad_norm": 4.247589588165283, "kl": 0.07631747052073479, "learning_rate": 4.904444444444445e-06, "loss": -0.0217, "num_tokens": 362609.0, "reward": 2.875, "reward_std": 3.3008837699890137, "rewards/reward_combined/mean": 2.875, "rewards/reward_combined/std": 3.3008837699890137, "step": 1173 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 21.74074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.13050247728824615, "kl": 0.006852202117443085, "learning_rate": 4.903888888888889e-06, "loss": 0.0003, "num_tokens": 362829.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 1174 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 38.25, "completions/clipped_ratio": 0.0, "completions/max_length": 45.0, "completions/max_terminated_length": 45.0, "completions/mean_length": 38.25, "completions/mean_terminated_length": 38.25, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 21.75925925925926, "frac_reward_zero_std": 0.0, "grad_norm": 3.740248680114746, "kl": 0.03979816287755966, "learning_rate": 4.9033333333333335e-06, "loss": -0.0217, "num_tokens": 363198.0, "reward": 2.25, "reward_std": 1.443375587463379, "rewards/reward_combined/mean": 2.25, "rewards/reward_combined/std": 1.4433757066726685, "step": 1175 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.25, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 12.25, "completions/mean_terminated_length": 12.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 21.77777777777778, "frac_reward_zero_std": 0.0, "grad_norm": 7.865298748016357, "kl": 0.06548908725380898, "learning_rate": 4.902777777777778e-06, "loss": 0.0777, "num_tokens": 363475.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 1176 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 21.796296296296298, "frac_reward_zero_std": 0.0, "grad_norm": 3.132539749145508, "kl": 0.05221923440694809, "learning_rate": 4.902222222222222e-06, "loss": -0.0017, "num_tokens": 363779.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 1177 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 21.814814814814813, "frac_reward_zero_std": 1.0, "grad_norm": 0.014535834081470966, "kl": 0.00343449495267123, "learning_rate": 4.901666666666667e-06, "loss": 0.0002, "num_tokens": 364065.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1178 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.75, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 21.75, "completions/mean_terminated_length": 21.75, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 21.833333333333332, "frac_reward_zero_std": 1.0, "grad_norm": 0.05909493565559387, "kl": 0.01025629648938775, "learning_rate": 4.901111111111112e-06, "loss": 0.0005, "num_tokens": 364396.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1179 }, { "clip_ratio/high_max": 0.007462686393409967, "clip_ratio/high_mean": 0.007462686393409967, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.007462686393409967, "completion_length": 27.5, "completions/clipped_ratio": 0.0, "completions/max_length": 36.0, "completions/max_terminated_length": 36.0, "completions/mean_length": 27.5, "completions/mean_terminated_length": 27.5, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 21.85185185185185, "frac_reward_zero_std": 0.0, "grad_norm": 4.124629497528076, "kl": 0.17589575052261353, "learning_rate": 4.900555555555556e-06, "loss": -0.0917, "num_tokens": 364758.0, "reward": 2.875, "reward_std": 3.3008837699890137, "rewards/reward_combined/mean": 2.875, "rewards/reward_combined/std": 3.3008837699890137, "step": 1180 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.0, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 24.0, "completions/mean_terminated_length": 24.0, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 21.87037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.0045346482656896114, "kl": 0.09039538726210594, "learning_rate": 4.9000000000000005e-06, "loss": 0.0045, "num_tokens": 365122.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 1181 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 21.88888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.20543627440929413, "kl": 0.04643070511519909, "learning_rate": 4.899444444444445e-06, "loss": 0.0023, "num_tokens": 365390.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1182 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.5, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 16.5, "completions/mean_terminated_length": 16.5, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 21.90740740740741, "frac_reward_zero_std": 1.0, "grad_norm": 0.16666434705257416, "kl": 0.01648897770792246, "learning_rate": 4.898888888888889e-06, "loss": 0.0008, "num_tokens": 365704.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 1183 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 38.25, "completions/clipped_ratio": 0.0, "completions/max_length": 51.0, "completions/max_terminated_length": 51.0, "completions/mean_length": 38.25, "completions/mean_terminated_length": 38.25, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 21.925925925925927, "frac_reward_zero_std": 0.0, "grad_norm": 4.639585018157959, "kl": 0.1505536437034607, "learning_rate": 4.898333333333334e-06, "loss": -0.1406, "num_tokens": 366081.0, "reward": 4.0, "reward_std": 4.041451930999756, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 4.041451930999756, "step": 1184 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.5, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 4.5, "completions/mean_terminated_length": 4.5, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 21.944444444444443, "frac_reward_zero_std": 1.0, "grad_norm": 0.28209301829338074, "kl": 0.03304676711559296, "learning_rate": 4.897777777777778e-06, "loss": 0.0017, "num_tokens": 366295.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 1185 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.75, "completions/clipped_ratio": 0.0, "completions/max_length": 52.0, "completions/max_terminated_length": 52.0, "completions/mean_length": 19.75, "completions/mean_terminated_length": 19.75, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 21.962962962962962, "frac_reward_zero_std": 0.0, "grad_norm": 6.095164775848389, "kl": 0.038580963388085365, "learning_rate": 4.897222222222222e-06, "loss": 0.3317, "num_tokens": 366594.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 1186 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 21.98148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.006679816171526909, "kl": 0.0033981555607169867, "learning_rate": 4.896666666666667e-06, "loss": 0.0002, "num_tokens": 366890.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1187 }, { "clip_ratio/high_max": 0.015384615398943424, "clip_ratio/high_mean": 0.015384615398943424, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015384615398943424, "completion_length": 37.5, "completions/clipped_ratio": 0.0, "completions/max_length": 43.0, "completions/max_terminated_length": 43.0, "completions/mean_length": 37.5, "completions/mean_terminated_length": 37.5, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 22.0, "frac_reward_zero_std": 0.0, "grad_norm": 2.6942877769470215, "kl": 0.11890515685081482, "learning_rate": 4.896111111111112e-06, "loss": 0.0722, "num_tokens": 367276.0, "reward": 1.875, "reward_std": 1.6007810831069946, "rewards/reward_combined/mean": 1.875, "rewards/reward_combined/std": 1.6007810831069946, "step": 1188 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 22.01851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.01694505102932453, "kl": 0.00035868585109710693, "learning_rate": 4.895555555555556e-06, "loss": 0.0, "num_tokens": 367488.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 1189 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.25, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.25, "completions/mean_terminated_length": 14.25, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 22.037037037037038, "frac_reward_zero_std": 1.0, "grad_norm": 0.055587153881788254, "kl": 0.018433132208883762, "learning_rate": 4.8950000000000006e-06, "loss": 0.0009, "num_tokens": 367777.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1190 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 18.0, "completions/mean_terminated_length": 18.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 22.055555555555557, "frac_reward_zero_std": 1.0, "grad_norm": 0.009831026196479797, "kl": 0.0015212628641165793, "learning_rate": 4.894444444444445e-06, "loss": 0.0001, "num_tokens": 368089.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1191 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.25, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 8.25, "completions/mean_terminated_length": 8.25, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 22.074074074074073, "frac_reward_zero_std": 0.0, "grad_norm": 13.206089973449707, "kl": 0.08026685193181038, "learning_rate": 4.893888888888889e-06, "loss": 0.3054, "num_tokens": 368322.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 1192 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.75, "completions/clipped_ratio": 0.0, "completions/max_length": 25.0, "completions/max_terminated_length": 25.0, "completions/mean_length": 22.75, "completions/mean_terminated_length": 22.75, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 22.09259259259259, "frac_reward_zero_std": 0.0, "grad_norm": 6.059370040893555, "kl": 0.16548923403024673, "learning_rate": 4.893333333333334e-06, "loss": 0.0352, "num_tokens": 368661.0, "reward": 2.5, "reward_std": 1.7320507764816284, "rewards/reward_combined/mean": 2.5, "rewards/reward_combined/std": 1.7320507764816284, "step": 1193 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 22.11111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.008287239819765091, "kl": 0.006901221349835396, "learning_rate": 4.892777777777778e-06, "loss": 0.0003, "num_tokens": 368973.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 1194 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 22.12962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.007269767113029957, "kl": 0.003182690590620041, "learning_rate": 4.892222222222222e-06, "loss": 0.0002, "num_tokens": 369233.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 1195 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 22.14814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.003147072857245803, "kl": 0.0181342801079154, "learning_rate": 4.8916666666666675e-06, "loss": 0.0009, "num_tokens": 369493.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1196 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.75, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 12.75, "completions/mean_terminated_length": 12.75, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 22.166666666666668, "frac_reward_zero_std": 1.0, "grad_norm": 0.16594883799552917, "kl": 0.02289127092808485, "learning_rate": 4.891111111111111e-06, "loss": 0.0011, "num_tokens": 369768.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1197 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 25.0, "completions/max_terminated_length": 25.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 22.185185185185187, "frac_reward_zero_std": 0.0, "grad_norm": 5.102049350738525, "kl": 0.07805067673325539, "learning_rate": 4.890555555555556e-06, "loss": 0.1975, "num_tokens": 370050.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 1198 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.25, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 16.25, "completions/mean_terminated_length": 16.25, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 22.203703703703702, "frac_reward_zero_std": 1.0, "grad_norm": 0.3085663318634033, "kl": 0.06572166457772255, "learning_rate": 4.890000000000001e-06, "loss": 0.0034, "num_tokens": 370335.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1199 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.75, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.75, "completions/mean_terminated_length": 6.75, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 22.22222222222222, "frac_reward_zero_std": 1.0, "grad_norm": 0.012843611650168896, "kl": 0.003091962542384863, "learning_rate": 4.889444444444445e-06, "loss": 0.0002, "num_tokens": 370570.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 1200 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 17.0, "completions/mean_terminated_length": 17.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 22.24074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.03468000143766403, "kl": 0.010092430748045444, "learning_rate": 4.888888888888889e-06, "loss": 0.0005, "num_tokens": 370878.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1201 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 22.25925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.09464526176452637, "kl": 0.02200379502028227, "learning_rate": 4.888333333333334e-06, "loss": 0.0011, "num_tokens": 371164.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1202 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 22.27777777777778, "frac_reward_zero_std": 1.0, "grad_norm": 0.013751049526035786, "kl": 0.0002351030707359314, "learning_rate": 4.887777777777778e-06, "loss": 0.0, "num_tokens": 371400.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 1203 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.0, "completions/clipped_ratio": 0.0, "completions/max_length": 31.0, "completions/max_terminated_length": 31.0, "completions/mean_length": 25.0, "completions/mean_terminated_length": 25.0, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 22.296296296296298, "frac_reward_zero_std": 0.0, "grad_norm": 5.15621280670166, "kl": 0.18059402704238892, "learning_rate": 4.887222222222222e-06, "loss": -0.0784, "num_tokens": 371724.0, "reward": 7.625, "reward_std": 0.25, "rewards/reward_combined/mean": 7.625, "rewards/reward_combined/std": 0.25, "step": 1204 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 7.25, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 7.25, "completions/mean_terminated_length": 7.25, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 22.314814814814813, "frac_reward_zero_std": 1.0, "grad_norm": 0.11530797928571701, "kl": 0.011308697052299976, "learning_rate": 4.886666666666668e-06, "loss": 0.001, "num_tokens": 371957.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 1205 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.0, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 22.0, "completions/mean_terminated_length": 22.0, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 22.333333333333332, "frac_reward_zero_std": 1.0, "grad_norm": 0.1371685415506363, "kl": 0.022691598162055016, "learning_rate": 4.886111111111111e-06, "loss": 0.0011, "num_tokens": 372273.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1206 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.0, "completions/clipped_ratio": 0.0, "completions/max_length": 35.0, "completions/max_terminated_length": 35.0, "completions/mean_length": 22.0, "completions/mean_terminated_length": 22.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 22.35185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.5358145236968994, "kl": 0.17567165195941925, "learning_rate": 4.885555555555556e-06, "loss": 0.0089, "num_tokens": 372601.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1207 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 27.75, "completions/clipped_ratio": 0.0, "completions/max_length": 54.0, "completions/max_terminated_length": 54.0, "completions/mean_length": 27.75, "completions/mean_terminated_length": 27.75, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 22.37037037037037, "frac_reward_zero_std": 0.0, "grad_norm": 5.5925798416137695, "kl": 0.1876046359539032, "learning_rate": 4.885000000000001e-06, "loss": 0.2507, "num_tokens": 372932.0, "reward": 5.25, "reward_std": 4.5, "rewards/reward_combined/mean": 5.25, "rewards/reward_combined/std": 4.5, "step": 1208 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 88.25, "completions/clipped_ratio": 0.25, "completions/max_length": 256.0, "completions/max_terminated_length": 35.0, "completions/mean_length": 88.25, "completions/mean_terminated_length": 32.333335876464844, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 22.38888888888889, "frac_reward_zero_std": 0.0, "grad_norm": 2.3190274238586426, "kl": 0.062431614845991135, "learning_rate": 4.884444444444445e-06, "loss": 0.1747, "num_tokens": 373521.0, "reward": 1.2999999523162842, "reward_std": 2.4481287002563477, "rewards/reward_combined/mean": 1.2999999523162842, "rewards/reward_combined/std": 2.4481284618377686, "step": 1209 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 22.40740740740741, "frac_reward_zero_std": 1.0, "grad_norm": 0.00012469755893107504, "kl": 6.183981895446777e-06, "learning_rate": 4.883888888888889e-06, "loss": 0.0, "num_tokens": 373741.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 1210 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.0, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 24.0, "completions/mean_terminated_length": 24.0, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 22.425925925925927, "frac_reward_zero_std": 1.0, "grad_norm": 0.002993199275806546, "kl": 0.0906413085758686, "learning_rate": 4.883333333333334e-06, "loss": 0.0045, "num_tokens": 374105.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 1211 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.75, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.75, "completions/mean_terminated_length": 14.75, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 22.444444444444443, "frac_reward_zero_std": 1.0, "grad_norm": 0.03418496251106262, "kl": 0.0017157458933070302, "learning_rate": 4.882777777777778e-06, "loss": 0.0001, "num_tokens": 374424.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1212 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 42.5, "completions/clipped_ratio": 0.0, "completions/max_length": 53.0, "completions/max_terminated_length": 53.0, "completions/mean_length": 42.5, "completions/mean_terminated_length": 42.5, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 22.462962962962962, "frac_reward_zero_std": 0.0, "grad_norm": 3.9086012840270996, "kl": 0.12557106465101242, "learning_rate": 4.8822222222222224e-06, "loss": 0.0727, "num_tokens": 374818.0, "reward": 1.0499999523162842, "reward_std": 4.859012126922607, "rewards/reward_combined/mean": 1.0499999523162842, "rewards/reward_combined/std": 4.859012126922607, "step": 1213 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.5, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 18.5, "completions/mean_terminated_length": 18.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 22.48148148148148, "frac_reward_zero_std": 0.0, "grad_norm": 3.6492769718170166, "kl": 0.017156684771180153, "learning_rate": 4.881666666666668e-06, "loss": -0.0287, "num_tokens": 375120.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 1214 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.5, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 12.5, "completions/mean_terminated_length": 12.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 22.5, "frac_reward_zero_std": 0.0, "grad_norm": 2.843365430831909, "kl": 0.008289291406981647, "learning_rate": 4.881111111111111e-06, "loss": 0.0576, "num_tokens": 375394.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 1215 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 22.51851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.07524572312831879, "kl": 0.051352959126234055, "learning_rate": 4.880555555555556e-06, "loss": 0.0026, "num_tokens": 375688.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1216 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.75, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 20.75, "completions/mean_terminated_length": 20.75, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 22.537037037037038, "frac_reward_zero_std": 0.0, "grad_norm": 3.551126718521118, "kl": 0.025477640330791473, "learning_rate": 4.880000000000001e-06, "loss": -0.0332, "num_tokens": 376019.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 1217 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.5, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 8.5, "completions/mean_terminated_length": 8.5, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 22.555555555555557, "frac_reward_zero_std": 1.0, "grad_norm": 0.14398939907550812, "kl": 0.013485996052622795, "learning_rate": 4.879444444444445e-06, "loss": 0.0005, "num_tokens": 376273.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 1218 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 22.574074074074073, "frac_reward_zero_std": 1.0, "grad_norm": 0.2311774641275406, "kl": 0.024828763213008642, "learning_rate": 4.878888888888889e-06, "loss": 0.0012, "num_tokens": 376557.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1219 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 22.59259259259259, "frac_reward_zero_std": 1.0, "grad_norm": 0.1660335659980774, "kl": 0.028198611922562122, "learning_rate": 4.878333333333334e-06, "loss": 0.0014, "num_tokens": 376829.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1220 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.25, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 22.25, "completions/mean_terminated_length": 22.25, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 22.61111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.040395550429821014, "kl": 0.009010890033096075, "learning_rate": 4.877777777777778e-06, "loss": 0.0005, "num_tokens": 377162.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1221 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.0, "completions/clipped_ratio": 0.0, "completions/max_length": 38.0, "completions/max_terminated_length": 38.0, "completions/mean_length": 25.0, "completions/mean_terminated_length": 25.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 22.62962962962963, "frac_reward_zero_std": 0.0, "grad_norm": 5.41686487197876, "kl": 0.05376596376299858, "learning_rate": 4.8772222222222225e-06, "loss": 0.1028, "num_tokens": 377482.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 1222 }, { "clip_ratio/high_max": 0.008064515888690948, "clip_ratio/high_mean": 0.008064515888690948, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.008064515888690948, "completion_length": 30.0, "completions/clipped_ratio": 0.0, "completions/max_length": 33.0, "completions/max_terminated_length": 33.0, "completions/mean_length": 30.0, "completions/mean_terminated_length": 30.0, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 22.64814814814815, "frac_reward_zero_std": 0.0, "grad_norm": 3.158329963684082, "kl": 0.06770491600036621, "learning_rate": 4.876666666666668e-06, "loss": -0.0736, "num_tokens": 377818.0, "reward": 2.25, "reward_std": 1.443375587463379, "rewards/reward_combined/mean": 2.25, "rewards/reward_combined/std": 1.4433757066726685, "step": 1223 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.0, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 21.0, "completions/mean_terminated_length": 21.0, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 22.666666666666668, "frac_reward_zero_std": 1.0, "grad_norm": 0.1078508049249649, "kl": 0.03290352877229452, "learning_rate": 4.876111111111111e-06, "loss": 0.0016, "num_tokens": 378154.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1224 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 30.25, "completions/clipped_ratio": 0.0, "completions/max_length": 48.0, "completions/max_terminated_length": 48.0, "completions/mean_length": 30.25, "completions/mean_terminated_length": 30.25, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 22.685185185185187, "frac_reward_zero_std": 0.0, "grad_norm": 6.411366939544678, "kl": 0.053302984684705734, "learning_rate": 4.875555555555556e-06, "loss": 0.2425, "num_tokens": 378495.0, "reward": 6.5, "reward_std": 2.0, "rewards/reward_combined/mean": 6.5, "rewards/reward_combined/std": 2.0, "step": 1225 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.75, "completions/clipped_ratio": 0.0, "completions/max_length": 28.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 24.75, "completions/mean_terminated_length": 24.75, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 22.703703703703702, "frac_reward_zero_std": 0.0, "grad_norm": 6.7144975662231445, "kl": 0.14690274745225906, "learning_rate": 4.875e-06, "loss": 0.1281, "num_tokens": 378822.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 1226 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 22.72222222222222, "frac_reward_zero_std": 1.0, "grad_norm": 0.284180611371994, "kl": 0.02506973221898079, "learning_rate": 4.874444444444445e-06, "loss": 0.0013, "num_tokens": 379066.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 1227 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 22.74074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.892489492893219, "kl": 0.058013347908854485, "learning_rate": 4.8738888888888895e-06, "loss": 0.0026, "num_tokens": 379322.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1228 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 22.75925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.0880950540304184, "kl": 0.00881899637170136, "learning_rate": 4.873333333333334e-06, "loss": 0.0004, "num_tokens": 379610.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1229 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 5.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 5.0, "completions/mean_terminated_length": 5.0, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 22.77777777777778, "frac_reward_zero_std": 1.0, "grad_norm": 0.0066324640065431595, "kl": 0.002224785159341991, "learning_rate": 4.872777777777778e-06, "loss": 0.0001, "num_tokens": 379830.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 1230 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.25, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 15.25, "completions/mean_terminated_length": 15.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 22.796296296296298, "frac_reward_zero_std": 0.0, "grad_norm": 6.0038628578186035, "kl": 0.14364323392510414, "learning_rate": 4.8722222222222225e-06, "loss": 0.1927, "num_tokens": 380107.0, "reward": 4.5, "reward_std": 4.041451930999756, "rewards/reward_combined/mean": 4.5, "rewards/reward_combined/std": 4.041451930999756, "step": 1231 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.75, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 11.75, "completions/mean_terminated_length": 11.75, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 22.814814814814813, "frac_reward_zero_std": 1.0, "grad_norm": 0.23601622879505157, "kl": 0.04062902554869652, "learning_rate": 4.871666666666668e-06, "loss": 0.0021, "num_tokens": 380378.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1232 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.25, "completions/clipped_ratio": 0.0, "completions/max_length": 29.0, "completions/max_terminated_length": 29.0, "completions/mean_length": 24.25, "completions/mean_terminated_length": 24.25, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 22.833333333333332, "frac_reward_zero_std": 0.0, "grad_norm": 4.7099080085754395, "kl": 0.10279777646064758, "learning_rate": 4.871111111111111e-06, "loss": 0.07, "num_tokens": 380727.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 1233 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.75, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 16.75, "completions/mean_terminated_length": 16.75, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 22.85185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.15196821093559265, "kl": 0.025494626723229885, "learning_rate": 4.8705555555555565e-06, "loss": 0.0013, "num_tokens": 381034.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_combined/mean": 1.0, "rewards/reward_combined/std": 0.0, "step": 1234 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.75, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 19.75, "completions/mean_terminated_length": 19.75, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 22.87037037037037, "frac_reward_zero_std": 0.0, "grad_norm": 4.560300827026367, "kl": 0.13921474665403366, "learning_rate": 4.87e-06, "loss": 0.0679, "num_tokens": 381341.0, "reward": 4.625, "reward_std": 2.25, "rewards/reward_combined/mean": 4.625, "rewards/reward_combined/std": 2.25, "step": 1235 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 43.0, "completions/clipped_ratio": 0.0, "completions/max_length": 43.0, "completions/max_terminated_length": 43.0, "completions/mean_length": 43.0, "completions/mean_terminated_length": 43.0, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 22.88888888888889, "frac_reward_zero_std": 0.0, "grad_norm": 2.9422266483306885, "kl": 0.07285254821181297, "learning_rate": 4.869444444444445e-06, "loss": 0.0021, "num_tokens": 381793.0, "reward": 2.375, "reward_std": 1.25, "rewards/reward_combined/mean": 2.375, "rewards/reward_combined/std": 1.25, "step": 1236 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 22.90740740740741, "frac_reward_zero_std": 1.0, "grad_norm": 0.7736340761184692, "kl": 0.08779716771095991, "learning_rate": 4.8688888888888895e-06, "loss": 0.0047, "num_tokens": 382091.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1237 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 22.925925925925927, "frac_reward_zero_std": 1.0, "grad_norm": 0.05533405393362045, "kl": 0.0076990071684122086, "learning_rate": 4.868333333333334e-06, "loss": 0.0004, "num_tokens": 382359.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1238 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 22.944444444444443, "frac_reward_zero_std": 1.0, "grad_norm": 0.6018087863922119, "kl": 0.05531853623688221, "learning_rate": 4.867777777777778e-06, "loss": 0.0028, "num_tokens": 382615.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 1239 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 12.5, "completions/mean_terminated_length": 12.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 22.962962962962962, "frac_reward_zero_std": 1.0, "grad_norm": 0.1192927435040474, "kl": 0.020018479321151972, "learning_rate": 4.867222222222223e-06, "loss": 0.001, "num_tokens": 382877.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1240 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.5, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 22.5, "completions/mean_terminated_length": 22.5, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 22.98148148148148, "frac_reward_zero_std": 0.0, "grad_norm": 5.366946220397949, "kl": 0.13897836953401566, "learning_rate": 4.866666666666667e-06, "loss": -0.0195, "num_tokens": 383187.0, "reward": 1.75, "reward_std": 1.5, "rewards/reward_combined/mean": 1.75, "rewards/reward_combined/std": 1.5, "step": 1241 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.5, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 3.5, "completions/mean_terminated_length": 3.5, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 23.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.1767386496067047, "kl": 0.003003247082233429, "learning_rate": 4.866111111111111e-06, "loss": 0.0002, "num_tokens": 383397.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 1242 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 23.01851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.35805898904800415, "kl": 0.03285284619778395, "learning_rate": 4.8655555555555565e-06, "loss": 0.0016, "num_tokens": 383653.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 1243 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 23.037037037037038, "frac_reward_zero_std": 1.0, "grad_norm": 0.02546204999089241, "kl": 0.030486248433589935, "learning_rate": 4.865e-06, "loss": 0.0015, "num_tokens": 383869.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 1244 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.25, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 16.25, "completions/mean_terminated_length": 16.25, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 23.055555555555557, "frac_reward_zero_std": 0.0, "grad_norm": 3.654215097427368, "kl": 0.04628860764205456, "learning_rate": 4.864444444444445e-06, "loss": 0.0354, "num_tokens": 384174.0, "reward": 1.625, "reward_std": 1.25, "rewards/reward_combined/mean": 1.625, "rewards/reward_combined/std": 1.25, "step": 1245 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 23.074074074074073, "frac_reward_zero_std": 1.0, "grad_norm": 0.021433765068650246, "kl": 0.00452861818484962, "learning_rate": 4.863888888888889e-06, "loss": 0.0002, "num_tokens": 384476.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1246 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 66.25, "completions/clipped_ratio": 0.25, "completions/max_length": 256.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 66.25, "completions/mean_terminated_length": 3.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 23.09259259259259, "frac_reward_zero_std": 0.0, "grad_norm": 3.3981246948242188, "kl": 0.005638260976411402, "learning_rate": 4.863333333333334e-06, "loss": 0.625, "num_tokens": 384945.0, "reward": 1.2999999523162842, "reward_std": 2.785677671432495, "rewards/reward_combined/mean": 1.2999999523162842, "rewards/reward_combined/std": 2.785677671432495, "step": 1247 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.25, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 22.25, "completions/mean_terminated_length": 22.25, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 23.11111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.049965083599090576, "kl": 0.010630922857671976, "learning_rate": 4.862777777777778e-06, "loss": 0.0005, "num_tokens": 385278.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1248 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 23.12962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.014857045374810696, "kl": 0.0034819766879081726, "learning_rate": 4.862222222222223e-06, "loss": 0.0002, "num_tokens": 385538.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 1249 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 10.5, "completions/mean_terminated_length": 10.5, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 23.14814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.570472240447998, "kl": 0.05043673329055309, "learning_rate": 4.861666666666667e-06, "loss": 0.0028, "num_tokens": 385804.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1250 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.5, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 8.5, "completions/mean_terminated_length": 8.5, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 23.166666666666668, "frac_reward_zero_std": 1.0, "grad_norm": 0.09994973987340927, "kl": 0.012881585862487555, "learning_rate": 4.861111111111111e-06, "loss": 0.0005, "num_tokens": 386058.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 1251 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.5, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 16.5, "completions/mean_terminated_length": 16.5, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 23.185185185185187, "frac_reward_zero_std": 0.0, "grad_norm": 3.7367568016052246, "kl": 0.028914999216794968, "learning_rate": 4.8605555555555565e-06, "loss": 0.0328, "num_tokens": 386372.0, "reward": 2.75, "reward_std": 1.5, "rewards/reward_combined/mean": 2.75, "rewards/reward_combined/std": 1.5, "step": 1252 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 9.5, "completions/mean_terminated_length": 9.5, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 23.203703703703702, "frac_reward_zero_std": 1.0, "grad_norm": 0.22528225183486938, "kl": 0.01934004109352827, "learning_rate": 4.86e-06, "loss": 0.001, "num_tokens": 386630.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1253 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.0, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 24.0, "completions/mean_terminated_length": 24.0, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 23.22222222222222, "frac_reward_zero_std": 1.0, "grad_norm": 0.002420595148578286, "kl": 0.09068793803453445, "learning_rate": 4.859444444444445e-06, "loss": 0.0045, "num_tokens": 386994.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 1254 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.25, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.25, "completions/mean_terminated_length": 14.25, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 23.24074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.09847412258386612, "kl": 0.021856773644685745, "learning_rate": 4.858888888888889e-06, "loss": 0.0011, "num_tokens": 387283.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1255 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 23.25925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.32191118597984314, "kl": 0.029192131012678146, "learning_rate": 4.858333333333334e-06, "loss": 0.0015, "num_tokens": 387543.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1256 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 5.25, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 5.25, "completions/mean_terminated_length": 5.25, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 23.27777777777778, "frac_reward_zero_std": 1.0, "grad_norm": 1.0525048971176147, "kl": 0.11644791369326413, "learning_rate": 4.857777777777778e-06, "loss": 0.0063, "num_tokens": 387764.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 1257 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 23.296296296296298, "frac_reward_zero_std": 1.0, "grad_norm": 0.012801372446119785, "kl": 0.0029341578483581543, "learning_rate": 4.857222222222223e-06, "loss": 0.0001, "num_tokens": 388008.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 1258 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.75, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 6.75, "completions/mean_terminated_length": 6.75, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 23.314814814814813, "frac_reward_zero_std": 0.0, "grad_norm": 11.977566719055176, "kl": 0.10901259630918503, "learning_rate": 4.856666666666667e-06, "loss": 0.088, "num_tokens": 388255.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 1259 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 23.333333333333332, "frac_reward_zero_std": 1.0, "grad_norm": 0.03407379984855652, "kl": 0.0011457204818725586, "learning_rate": 4.856111111111111e-06, "loss": 0.0001, "num_tokens": 388467.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 1260 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 23.35185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.07144872844219208, "kl": 0.012043708469718695, "learning_rate": 4.855555555555556e-06, "loss": 0.0006, "num_tokens": 388741.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1261 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.25, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 15.25, "completions/mean_terminated_length": 15.25, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 23.37037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.036061421036720276, "kl": 0.050676312297582626, "learning_rate": 4.855e-06, "loss": 0.0025, "num_tokens": 389034.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1262 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 39.5, "completions/clipped_ratio": 0.0, "completions/max_length": 43.0, "completions/max_terminated_length": 43.0, "completions/mean_length": 39.5, "completions/mean_terminated_length": 39.5, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 23.38888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.0683819055557251, "kl": 0.05150883086025715, "learning_rate": 4.854444444444445e-06, "loss": 0.0026, "num_tokens": 389472.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 1263 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 23.40740740740741, "frac_reward_zero_std": 1.0, "grad_norm": 0.0023555627558380365, "kl": 0.00344929366838187, "learning_rate": 4.853888888888889e-06, "loss": 0.0002, "num_tokens": 389768.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1264 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.75, "completions/clipped_ratio": 0.0, "completions/max_length": 27.0, "completions/max_terminated_length": 27.0, "completions/mean_length": 20.75, "completions/mean_terminated_length": 20.75, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 23.425925925925927, "frac_reward_zero_std": 0.0, "grad_norm": 5.435967922210693, "kl": 0.28992460668087006, "learning_rate": 4.853333333333334e-06, "loss": 0.0611, "num_tokens": 390071.0, "reward": 1.75, "reward_std": 1.5, "rewards/reward_combined/mean": 1.75, "rewards/reward_combined/std": 1.5, "step": 1265 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.5, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 2.5, "completions/mean_terminated_length": 2.5, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 23.444444444444443, "frac_reward_zero_std": 1.0, "grad_norm": 0.07216019928455353, "kl": 0.0030257850885391235, "learning_rate": 4.8527777777777775e-06, "loss": 0.0001, "num_tokens": 390277.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 1266 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 23.462962962962962, "frac_reward_zero_std": 1.0, "grad_norm": 0.018558533862233162, "kl": 0.00471577956341207, "learning_rate": 4.852222222222223e-06, "loss": 0.0002, "num_tokens": 390561.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1267 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 23.48148148148148, "frac_reward_zero_std": 0.0, "grad_norm": 3.3733978271484375, "kl": 0.013044813007581979, "learning_rate": 4.851666666666667e-06, "loss": 0.0102, "num_tokens": 390881.0, "reward": 4.625, "reward_std": 3.4731109142303467, "rewards/reward_combined/mean": 4.625, "rewards/reward_combined/std": 3.4731109142303467, "step": 1268 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.25, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 20.25, "completions/mean_terminated_length": 20.25, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 23.5, "frac_reward_zero_std": 0.0, "grad_norm": 8.406050682067871, "kl": 0.12190075777471066, "learning_rate": 4.8511111111111114e-06, "loss": 0.0397, "num_tokens": 391210.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 1269 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 23.51851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.0072555034421384335, "kl": 0.01721816696226597, "learning_rate": 4.850555555555556e-06, "loss": 0.0009, "num_tokens": 391470.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1270 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.25, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 16.25, "completions/mean_terminated_length": 16.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 23.537037037037038, "frac_reward_zero_std": 0.0, "grad_norm": 4.365720272064209, "kl": 0.19877377152442932, "learning_rate": 4.85e-06, "loss": 0.0994, "num_tokens": 391751.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 1271 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 23.555555555555557, "frac_reward_zero_std": 1.0, "grad_norm": 0.4916924238204956, "kl": 0.0984172485768795, "learning_rate": 4.849444444444445e-06, "loss": 0.0049, "num_tokens": 392019.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1272 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 90.0, "completions/clipped_ratio": 0.25, "completions/max_length": 256.0, "completions/max_terminated_length": 45.0, "completions/mean_length": 90.0, "completions/mean_terminated_length": 34.66666793823242, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 23.574074074074073, "frac_reward_zero_std": 0.0, "grad_norm": 1.69490647315979, "kl": 0.11855834908783436, "learning_rate": 4.848888888888889e-06, "loss": 0.473, "num_tokens": 392603.0, "reward": 4.425000190734863, "reward_std": 3.797696828842163, "rewards/reward_combined/mean": 4.425000190734863, "rewards/reward_combined/std": 3.797696828842163, "step": 1273 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 26.0, "completions/clipped_ratio": 0.0, "completions/max_length": 38.0, "completions/max_terminated_length": 38.0, "completions/mean_length": 26.0, "completions/mean_terminated_length": 26.0, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 23.59259259259259, "frac_reward_zero_std": 0.0, "grad_norm": 3.766730546951294, "kl": 0.057556966319680214, "learning_rate": 4.848333333333334e-06, "loss": -0.1124, "num_tokens": 392931.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 1274 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.0, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 20.0, "completions/mean_terminated_length": 20.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 23.61111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.05839148163795471, "kl": 0.026562407612800598, "learning_rate": 4.8477777777777776e-06, "loss": 0.0013, "num_tokens": 393259.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1275 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 88.0, "completions/clipped_ratio": 0.25, "completions/max_length": 256.0, "completions/max_terminated_length": 32.0, "completions/mean_length": 88.0, "completions/mean_terminated_length": 32.0, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 23.62962962962963, "frac_reward_zero_std": 0.0, "grad_norm": 1.440394639968872, "kl": 0.04605616070330143, "learning_rate": 4.847222222222223e-06, "loss": 0.3906, "num_tokens": 393847.0, "reward": 2.549999952316284, "reward_std": 1.899999976158142, "rewards/reward_combined/mean": 2.549999952316284, "rewards/reward_combined/std": 1.899999976158142, "step": 1276 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.0, "completions/clipped_ratio": 0.0, "completions/max_length": 32.0, "completions/max_terminated_length": 32.0, "completions/mean_length": 25.0, "completions/mean_terminated_length": 25.0, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 23.64814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.10179184377193451, "kl": 0.019125239923596382, "learning_rate": 4.846666666666667e-06, "loss": 0.001, "num_tokens": 394175.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1277 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.0, "completions/clipped_ratio": 0.0, "completions/max_length": 29.0, "completions/max_terminated_length": 29.0, "completions/mean_length": 24.0, "completions/mean_terminated_length": 24.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 23.666666666666668, "frac_reward_zero_std": 0.0, "grad_norm": 3.303798198699951, "kl": 0.12015142291784286, "learning_rate": 4.8461111111111115e-06, "loss": 0.0551, "num_tokens": 394499.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 1278 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 29.5, "completions/clipped_ratio": 0.0, "completions/max_length": 36.0, "completions/max_terminated_length": 36.0, "completions/mean_length": 29.5, "completions/mean_terminated_length": 29.5, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 23.685185185185187, "frac_reward_zero_std": 0.0, "grad_norm": 2.9197704792022705, "kl": 0.1615377962589264, "learning_rate": 4.845555555555556e-06, "loss": -0.0652, "num_tokens": 394837.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 1279 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 36.25, "completions/clipped_ratio": 0.0, "completions/max_length": 48.0, "completions/max_terminated_length": 48.0, "completions/mean_length": 36.25, "completions/mean_terminated_length": 36.25, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 23.703703703703702, "frac_reward_zero_std": 0.0, "grad_norm": 2.75447154045105, "kl": 0.024499179795384407, "learning_rate": 4.845e-06, "loss": 0.0656, "num_tokens": 395198.0, "reward": 3.375, "reward_std": 3.3008837699890137, "rewards/reward_combined/mean": 3.375, "rewards/reward_combined/std": 3.3008837699890137, "step": 1280 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.25, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 15.25, "completions/mean_terminated_length": 15.25, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 23.72222222222222, "frac_reward_zero_std": 0.0, "grad_norm": 4.802914619445801, "kl": 0.1282813586294651, "learning_rate": 4.8444444444444446e-06, "loss": 0.0144, "num_tokens": 395483.0, "reward": 3.875, "reward_std": 4.802343368530273, "rewards/reward_combined/mean": 3.875, "rewards/reward_combined/std": 4.802343368530273, "step": 1281 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.0, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 20.0, "completions/mean_terminated_length": 20.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 23.74074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.264158695936203, "kl": 0.1242826022207737, "learning_rate": 4.843888888888889e-06, "loss": 0.0061, "num_tokens": 395783.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1282 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 23.75925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.0003097474400419742, "kl": 2.2195279598236084e-05, "learning_rate": 4.843333333333334e-06, "loss": 0.0, "num_tokens": 396003.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 1283 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.25, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.25, "completions/mean_terminated_length": 13.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 23.77777777777778, "frac_reward_zero_std": 1.0, "grad_norm": 0.11020349711179733, "kl": 0.02094514900818467, "learning_rate": 4.842777777777778e-06, "loss": 0.001, "num_tokens": 396280.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1284 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 23.796296296296298, "frac_reward_zero_std": 1.0, "grad_norm": 0.3863958716392517, "kl": 0.0553385354578495, "learning_rate": 4.842222222222223e-06, "loss": 0.0028, "num_tokens": 396550.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1285 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.25, "completions/clipped_ratio": 0.0, "completions/max_length": 37.0, "completions/max_terminated_length": 37.0, "completions/mean_length": 24.25, "completions/mean_terminated_length": 24.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 23.814814814814813, "frac_reward_zero_std": 0.0, "grad_norm": 4.425080299377441, "kl": 0.1290171891450882, "learning_rate": 4.841666666666667e-06, "loss": 0.2575, "num_tokens": 396875.0, "reward": 4.5, "reward_std": 4.041451930999756, "rewards/reward_combined/mean": 4.5, "rewards/reward_combined/std": 4.041451930999756, "step": 1286 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 18.0, "completions/mean_terminated_length": 18.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 23.833333333333332, "frac_reward_zero_std": 1.0, "grad_norm": 0.005828341469168663, "kl": 0.003788012661971152, "learning_rate": 4.8411111111111115e-06, "loss": 0.0002, "num_tokens": 397187.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1287 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 83.75, "completions/clipped_ratio": 0.25, "completions/max_length": 256.0, "completions/max_terminated_length": 41.0, "completions/mean_length": 83.75, "completions/mean_terminated_length": 26.33333396911621, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 23.85185185185185, "frac_reward_zero_std": 0.0, "grad_norm": 3.700582981109619, "kl": 0.043640341609716415, "learning_rate": 4.840555555555556e-06, "loss": 0.5472, "num_tokens": 397774.0, "reward": 3.924999952316284, "reward_std": 4.1620306968688965, "rewards/reward_combined/mean": 3.924999952316284, "rewards/reward_combined/std": 4.1620306968688965, "step": 1288 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.25, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 18.25, "completions/mean_terminated_length": 18.25, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 23.87037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.11955816298723221, "kl": 0.0187058262526989, "learning_rate": 4.84e-06, "loss": 0.001, "num_tokens": 398075.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1289 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.5, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 20.5, "completions/mean_terminated_length": 20.5, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 23.88888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.06325367838144302, "kl": 0.021864749491214752, "learning_rate": 4.839444444444445e-06, "loss": 0.0011, "num_tokens": 398409.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1290 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 23.90740740740741, "frac_reward_zero_std": 1.0, "grad_norm": 0.08604725450277328, "kl": 0.010428740177303553, "learning_rate": 4.838888888888889e-06, "loss": 0.0005, "num_tokens": 398697.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1291 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 31.5, "completions/clipped_ratio": 0.0, "completions/max_length": 63.0, "completions/max_terminated_length": 63.0, "completions/mean_length": 31.5, "completions/mean_terminated_length": 31.5, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 23.925925925925927, "frac_reward_zero_std": 0.0, "grad_norm": 5.156263828277588, "kl": 0.059210372157394886, "learning_rate": 4.838333333333334e-06, "loss": 0.2485, "num_tokens": 399063.0, "reward": 1.75, "reward_std": 2.0615527629852295, "rewards/reward_combined/mean": 1.75, "rewards/reward_combined/std": 2.0615527629852295, "step": 1292 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.75, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 12.75, "completions/mean_terminated_length": 12.75, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 23.944444444444443, "frac_reward_zero_std": 1.0, "grad_norm": 0.09864811599254608, "kl": 0.015690837986767292, "learning_rate": 4.837777777777778e-06, "loss": 0.0007, "num_tokens": 399326.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1293 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.75, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.75, "completions/mean_terminated_length": 6.75, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 23.962962962962962, "frac_reward_zero_std": 1.0, "grad_norm": 0.04078830033540726, "kl": 0.004249109362717718, "learning_rate": 4.837222222222223e-06, "loss": 0.0002, "num_tokens": 399561.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 1294 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 23.98148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.020550644025206566, "kl": 0.013283679261803627, "learning_rate": 4.836666666666667e-06, "loss": 0.0007, "num_tokens": 399833.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1295 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 24.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.058878980576992035, "kl": 0.009438025299459696, "learning_rate": 4.836111111111112e-06, "loss": 0.0005, "num_tokens": 400115.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1296 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 24.01851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.058050282299518585, "kl": 0.00630892557092011, "learning_rate": 4.835555555555556e-06, "loss": 0.0003, "num_tokens": 400401.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1297 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.25, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.25, "completions/mean_terminated_length": 11.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 24.037037037037038, "frac_reward_zero_std": 1.0, "grad_norm": 0.058209359645843506, "kl": 0.007736926199868321, "learning_rate": 4.835e-06, "loss": 0.0004, "num_tokens": 400674.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1298 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.5, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 19.5, "completions/mean_terminated_length": 19.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 24.055555555555557, "frac_reward_zero_std": 0.0, "grad_norm": 5.326216220855713, "kl": 0.21580228209495544, "learning_rate": 4.834444444444445e-06, "loss": 0.0376, "num_tokens": 400972.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 1299 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 24.074074074074073, "frac_reward_zero_std": 1.0, "grad_norm": 0.08047948032617569, "kl": 0.02763940393924713, "learning_rate": 4.833888888888889e-06, "loss": 0.0014, "num_tokens": 401240.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1300 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 24.09259259259259, "frac_reward_zero_std": 1.0, "grad_norm": 0.10081706196069717, "kl": 0.00686851586215198, "learning_rate": 4.833333333333333e-06, "loss": 0.0004, "num_tokens": 401540.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1301 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 77.5, "completions/clipped_ratio": 0.25, "completions/max_length": 256.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 77.5, "completions/mean_terminated_length": 18.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 24.11111111111111, "frac_reward_zero_std": 0.0, "grad_norm": 1.6226149797439575, "kl": 0.021918222308158875, "learning_rate": 4.832777777777778e-06, "loss": 0.434, "num_tokens": 402098.0, "reward": 2.875, "reward_std": 2.25, "rewards/reward_combined/mean": 2.875, "rewards/reward_combined/std": 2.25, "step": 1302 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 47.5, "completions/clipped_ratio": 0.0, "completions/max_length": 63.0, "completions/max_terminated_length": 63.0, "completions/mean_length": 47.5, "completions/mean_terminated_length": 47.5, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "epoch": 24.12962962962963, "frac_reward_zero_std": 0.0, "grad_norm": 1.9477072954177856, "kl": 0.05433430336415768, "learning_rate": 4.832222222222223e-06, "loss": 0.0034, "num_tokens": 402568.0, "reward": 2.799999952316284, "reward_std": 0.4000000059604645, "rewards/reward_combined/mean": 2.799999952316284, "rewards/reward_combined/std": 0.4000000059604645, "step": 1303 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.25, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 9.25, "completions/mean_terminated_length": 9.25, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 24.14814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.2530475854873657, "kl": 0.03089323587482795, "learning_rate": 4.831666666666667e-06, "loss": 0.0021, "num_tokens": 402813.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 1304 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 24.166666666666668, "frac_reward_zero_std": 1.0, "grad_norm": 0.008647313341498375, "kl": 0.0026513487100601196, "learning_rate": 4.831111111111112e-06, "loss": 0.0001, "num_tokens": 403025.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 1305 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.25, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 21.25, "completions/mean_terminated_length": 21.25, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 24.185185185185187, "frac_reward_zero_std": 0.0, "grad_norm": 3.7780637741088867, "kl": 0.027485931292176247, "learning_rate": 4.830555555555556e-06, "loss": 0.0155, "num_tokens": 403358.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 1306 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.25, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.25, "completions/mean_terminated_length": 13.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 24.203703703703702, "frac_reward_zero_std": 1.0, "grad_norm": 0.08530470728874207, "kl": 0.0156544903293252, "learning_rate": 4.83e-06, "loss": 0.0007, "num_tokens": 403635.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1307 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 28.0, "completions/clipped_ratio": 0.0, "completions/max_length": 32.0, "completions/max_terminated_length": 32.0, "completions/mean_length": 28.0, "completions/mean_terminated_length": 28.0, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 24.22222222222222, "frac_reward_zero_std": 1.0, "grad_norm": 0.5035592913627625, "kl": 0.22710295021533966, "learning_rate": 4.829444444444445e-06, "loss": 0.0113, "num_tokens": 403975.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1308 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 24.24074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.00014849298167973757, "kl": 8.463859558105469e-06, "learning_rate": 4.828888888888889e-06, "loss": 0.0, "num_tokens": 404195.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 1309 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 24.25925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.00961595680564642, "kl": 0.0011733942665159702, "learning_rate": 4.828333333333333e-06, "loss": 0.0001, "num_tokens": 404517.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1310 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 24.27777777777778, "frac_reward_zero_std": 1.0, "grad_norm": 0.02660864032804966, "kl": 0.003213491290807724, "learning_rate": 4.827777777777778e-06, "loss": 0.0002, "num_tokens": 404777.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 1311 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 24.296296296296298, "frac_reward_zero_std": 1.0, "grad_norm": 0.3551636040210724, "kl": 0.1597903072834015, "learning_rate": 4.827222222222223e-06, "loss": 0.0076, "num_tokens": 405049.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1312 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.0, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 17.0, "completions/mean_terminated_length": 17.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 24.314814814814813, "frac_reward_zero_std": 0.0, "grad_norm": 8.323555946350098, "kl": 0.2236951533704996, "learning_rate": 4.826666666666667e-06, "loss": 0.0658, "num_tokens": 405369.0, "reward": 4.875, "reward_std": 3.75, "rewards/reward_combined/mean": 4.875, "rewards/reward_combined/std": 3.75, "step": 1313 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.25, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 16.25, "completions/mean_terminated_length": 16.25, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 24.333333333333332, "frac_reward_zero_std": 0.0, "grad_norm": 4.757115840911865, "kl": 0.07386230118572712, "learning_rate": 4.826111111111112e-06, "loss": -0.069, "num_tokens": 405658.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 1314 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 36.0, "completions/clipped_ratio": 0.0, "completions/max_length": 50.0, "completions/max_terminated_length": 50.0, "completions/mean_length": 36.0, "completions/mean_terminated_length": 36.0, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 24.35185185185185, "frac_reward_zero_std": 0.0, "grad_norm": 3.634603977203369, "kl": 0.06220138631761074, "learning_rate": 4.825555555555556e-06, "loss": 0.2307, "num_tokens": 406054.0, "reward": 3.625, "reward_std": 4.516174793243408, "rewards/reward_combined/mean": 3.625, "rewards/reward_combined/std": 4.516174793243408, "step": 1315 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 24.37037037037037, "frac_reward_zero_std": 0.0, "grad_norm": 8.015656471252441, "kl": 0.05278782732784748, "learning_rate": 4.825e-06, "loss": 0.1827, "num_tokens": 406312.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 1316 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 24.38888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.011038083583116531, "kl": 0.01653577946126461, "learning_rate": 4.824444444444445e-06, "loss": 0.0008, "num_tokens": 406572.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1317 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.5, "completions/clipped_ratio": 0.0, "completions/max_length": 50.0, "completions/max_terminated_length": 50.0, "completions/mean_length": 18.5, "completions/mean_terminated_length": 18.5, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 24.40740740740741, "frac_reward_zero_std": 0.0, "grad_norm": 8.02423095703125, "kl": 0.010849637212231755, "learning_rate": 4.823888888888889e-06, "loss": 0.334, "num_tokens": 406866.0, "reward": 2.75, "reward_std": 1.5, "rewards/reward_combined/mean": 2.75, "rewards/reward_combined/std": 1.5, "step": 1318 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 18.0, "completions/mean_terminated_length": 18.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 24.425925925925927, "frac_reward_zero_std": 1.0, "grad_norm": 0.1808186173439026, "kl": 0.15812218189239502, "learning_rate": 4.8233333333333335e-06, "loss": 0.0079, "num_tokens": 407166.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1319 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 24.444444444444443, "frac_reward_zero_std": 1.0, "grad_norm": 0.012740233913064003, "kl": 0.004528794903308153, "learning_rate": 4.822777777777779e-06, "loss": 0.0002, "num_tokens": 407446.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1320 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.0, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 24.0, "completions/mean_terminated_length": 24.0, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 24.462962962962962, "frac_reward_zero_std": 1.0, "grad_norm": 0.0024575034622102976, "kl": 0.09065032377839088, "learning_rate": 4.822222222222222e-06, "loss": 0.0045, "num_tokens": 407810.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 1321 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 31.0, "completions/clipped_ratio": 0.0, "completions/max_length": 37.0, "completions/max_terminated_length": 37.0, "completions/mean_length": 31.0, "completions/mean_terminated_length": 31.0, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 24.48148148148148, "frac_reward_zero_std": 0.0, "grad_norm": 38.712615966796875, "kl": 1.7851112931966782, "learning_rate": 4.821666666666667e-06, "loss": 0.113, "num_tokens": 408158.0, "reward": 4.625, "reward_std": 3.4731109142303467, "rewards/reward_combined/mean": 4.625, "rewards/reward_combined/std": 3.4731109142303467, "step": 1322 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.0, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 19.0, "completions/mean_terminated_length": 19.0, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 24.5, "frac_reward_zero_std": 1.0, "grad_norm": 0.03509138152003288, "kl": 0.006908688927069306, "learning_rate": 4.821111111111112e-06, "loss": 0.0003, "num_tokens": 408462.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1323 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.0, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 22.0, "completions/mean_terminated_length": 22.0, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 24.51851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.0202711820602417, "kl": 0.012693223543465137, "learning_rate": 4.820555555555556e-06, "loss": 0.0006, "num_tokens": 408778.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1324 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 24.537037037037038, "frac_reward_zero_std": 0.0, "grad_norm": 3.0554535388946533, "kl": 0.03575649418053217, "learning_rate": 4.8200000000000004e-06, "loss": -0.0211, "num_tokens": 409082.0, "reward": 3.375, "reward_std": 3.3008837699890137, "rewards/reward_combined/mean": 3.375, "rewards/reward_combined/std": 3.3008837699890137, "step": 1325 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 24.555555555555557, "frac_reward_zero_std": 1.0, "grad_norm": 0.1524924337863922, "kl": 0.022596178576350212, "learning_rate": 4.819444444444445e-06, "loss": 0.0011, "num_tokens": 409338.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 1326 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.75, "completions/clipped_ratio": 0.0, "completions/max_length": 35.0, "completions/max_terminated_length": 35.0, "completions/mean_length": 22.75, "completions/mean_terminated_length": 22.75, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 24.574074074074073, "frac_reward_zero_std": 0.0, "grad_norm": 4.191066741943359, "kl": 0.04491184465587139, "learning_rate": 4.818888888888889e-06, "loss": -0.1339, "num_tokens": 409653.0, "reward": 7.625, "reward_std": 0.25, "rewards/reward_combined/mean": 7.625, "rewards/reward_combined/std": 0.25, "step": 1327 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 24.59259259259259, "frac_reward_zero_std": 1.0, "grad_norm": 0.015817591920495033, "kl": 0.0007918477058410645, "learning_rate": 4.8183333333333335e-06, "loss": 0.0, "num_tokens": 409865.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 1328 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.0, "completions/max_length": 36.0, "completions/max_terminated_length": 36.0, "completions/mean_length": 32.0, "completions/mean_terminated_length": 32.0, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 24.61111111111111, "frac_reward_zero_std": 0.0, "grad_norm": 4.4964447021484375, "kl": 0.05818246677517891, "learning_rate": 4.817777777777779e-06, "loss": 0.1037, "num_tokens": 410209.0, "reward": 2.875, "reward_std": 1.25, "rewards/reward_combined/mean": 2.875, "rewards/reward_combined/std": 1.25, "step": 1329 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 24.62962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.7568123936653137, "kl": 0.10170341655611992, "learning_rate": 4.817222222222222e-06, "loss": 0.0055, "num_tokens": 410475.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1330 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 5.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 5.0, "completions/mean_terminated_length": 5.0, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 24.64814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.010240979492664337, "kl": 0.0005221247702138498, "learning_rate": 4.816666666666667e-06, "loss": 0.0, "num_tokens": 410695.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 1331 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 5.75, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 5.75, "completions/mean_terminated_length": 5.75, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 24.666666666666668, "frac_reward_zero_std": 0.0, "grad_norm": 8.587267875671387, "kl": 0.010342339053750038, "learning_rate": 4.816111111111112e-06, "loss": 0.2347, "num_tokens": 410938.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 1332 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 24.685185185185187, "frac_reward_zero_std": 1.0, "grad_norm": 0.005187053233385086, "kl": 0.0074772909283638, "learning_rate": 4.815555555555556e-06, "loss": 0.0004, "num_tokens": 411250.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 1333 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.0, "completions/clipped_ratio": 0.0, "completions/max_length": 27.0, "completions/max_terminated_length": 27.0, "completions/mean_length": 22.0, "completions/mean_terminated_length": 22.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 24.703703703703702, "frac_reward_zero_std": 1.0, "grad_norm": 0.44820114970207214, "kl": 0.10145938955247402, "learning_rate": 4.8150000000000005e-06, "loss": 0.0053, "num_tokens": 411558.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1334 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.0, "completions/max_length": 34.0, "completions/max_terminated_length": 34.0, "completions/mean_length": 32.0, "completions/mean_terminated_length": 32.0, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 24.72222222222222, "frac_reward_zero_std": 0.0, "grad_norm": 2.477147102355957, "kl": 0.10505785793066025, "learning_rate": 4.814444444444445e-06, "loss": 0.003, "num_tokens": 411922.0, "reward": 3.25, "reward_std": 0.28867512941360474, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 0.28867512941360474, "step": 1335 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.75, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 12.75, "completions/mean_terminated_length": 12.75, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 24.74074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.07914198189973831, "kl": 0.008377711987122893, "learning_rate": 4.813888888888889e-06, "loss": 0.0004, "num_tokens": 412185.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1336 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.0, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 21.0, "completions/mean_terminated_length": 21.0, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 24.75925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.05416494607925415, "kl": 0.004517280962318182, "learning_rate": 4.8133333333333336e-06, "loss": 0.0002, "num_tokens": 412513.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1337 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 33.0, "completions/max_terminated_length": 33.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 24.77777777777778, "frac_reward_zero_std": 1.0, "grad_norm": 0.2074781060218811, "kl": 0.018534278497099876, "learning_rate": 4.812777777777779e-06, "loss": 0.0011, "num_tokens": 412797.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1338 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 7.25, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 7.25, "completions/mean_terminated_length": 7.25, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 24.796296296296298, "frac_reward_zero_std": 1.0, "grad_norm": 0.4081047475337982, "kl": 0.04496469534933567, "learning_rate": 4.812222222222222e-06, "loss": 0.0022, "num_tokens": 413026.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 1339 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.75, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.75, "completions/mean_terminated_length": 14.75, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 24.814814814814813, "frac_reward_zero_std": 1.0, "grad_norm": 0.08944093436002731, "kl": 0.011182056274265051, "learning_rate": 4.8116666666666675e-06, "loss": 0.0006, "num_tokens": 413317.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1340 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 73.75, "completions/clipped_ratio": 0.0, "completions/max_length": 232.0, "completions/max_terminated_length": 232.0, "completions/mean_length": 73.75, "completions/mean_terminated_length": 73.75, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 24.833333333333332, "frac_reward_zero_std": 0.0, "grad_norm": 1.6346852779388428, "kl": 0.014952249825000763, "learning_rate": 4.811111111111111e-06, "loss": 0.3776, "num_tokens": 413832.0, "reward": 5.75, "reward_std": 3.8405728340148926, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.8405728340148926, "step": 1341 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 27.25, "completions/clipped_ratio": 0.0, "completions/max_length": 53.0, "completions/max_terminated_length": 53.0, "completions/mean_length": 27.25, "completions/mean_terminated_length": 27.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 24.85185185185185, "frac_reward_zero_std": 0.0, "grad_norm": 5.18161153793335, "kl": 0.19288471341133118, "learning_rate": 4.810555555555556e-06, "loss": 0.2763, "num_tokens": 414181.0, "reward": 2.299999952316284, "reward_std": 3.802630662918091, "rewards/reward_combined/mean": 2.299999952316284, "rewards/reward_combined/std": 3.8026304244995117, "step": 1342 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 24.87037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.05178670585155487, "kl": 0.02894682064652443, "learning_rate": 4.8100000000000005e-06, "loss": 0.0015, "num_tokens": 414469.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1343 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 24.88888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.07594513148069382, "kl": 0.006707837805151939, "learning_rate": 4.809444444444445e-06, "loss": 0.0003, "num_tokens": 414739.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1344 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 24.90740740740741, "frac_reward_zero_std": 1.0, "grad_norm": 0.003877446986734867, "kl": 9.447336196899414e-06, "learning_rate": 4.808888888888889e-06, "loss": 0.0, "num_tokens": 414943.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 1345 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 24.925925925925927, "frac_reward_zero_std": 0.0, "grad_norm": 1.887001633644104, "kl": 0.02898476365953684, "learning_rate": 4.808333333333334e-06, "loss": 0.0003, "num_tokens": 415231.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 1346 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 18.0, "completions/mean_terminated_length": 18.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 24.944444444444443, "frac_reward_zero_std": 1.0, "grad_norm": 0.0986698791384697, "kl": 0.007300443132407963, "learning_rate": 4.807777777777779e-06, "loss": 0.0004, "num_tokens": 415543.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1347 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.25, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 20.25, "completions/mean_terminated_length": 20.25, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 24.962962962962962, "frac_reward_zero_std": 0.0, "grad_norm": 5.447276592254639, "kl": 0.12003881111741066, "learning_rate": 4.807222222222222e-06, "loss": -0.0202, "num_tokens": 415844.0, "reward": 2.75, "reward_std": 2.5, "rewards/reward_combined/mean": 2.75, "rewards/reward_combined/std": 2.5, "step": 1348 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 24.98148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.10451795905828476, "kl": 0.0070509344805032015, "learning_rate": 4.8066666666666675e-06, "loss": 0.0004, "num_tokens": 416108.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1349 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 25.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.26017075777053833, "kl": 0.01896486582700163, "learning_rate": 4.806111111111111e-06, "loss": 0.001, "num_tokens": 416406.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1350 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 25.01851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.0594012588262558, "kl": 0.002209317754022777, "learning_rate": 4.805555555555556e-06, "loss": 0.0001, "num_tokens": 416662.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 1351 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.5, "completions/clipped_ratio": 0.0, "completions/max_length": 54.0, "completions/max_terminated_length": 54.0, "completions/mean_length": 20.5, "completions/mean_terminated_length": 20.5, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 25.037037037037038, "frac_reward_zero_std": 1.0, "grad_norm": 0.21676333248615265, "kl": 0.08237233338877559, "learning_rate": 4.805000000000001e-06, "loss": 0.0025, "num_tokens": 416956.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 1352 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.5, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 16.5, "completions/mean_terminated_length": 16.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 25.055555555555557, "frac_reward_zero_std": 1.0, "grad_norm": 0.03953962400555611, "kl": 0.009102823212742805, "learning_rate": 4.804444444444445e-06, "loss": 0.0004, "num_tokens": 417250.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1353 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 25.074074074074073, "frac_reward_zero_std": 1.0, "grad_norm": 0.008472730405628681, "kl": 0.016896571964025497, "learning_rate": 4.803888888888889e-06, "loss": 0.0008, "num_tokens": 417510.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1354 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.5, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 8.5, "completions/mean_terminated_length": 8.5, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 25.09259259259259, "frac_reward_zero_std": 0.0, "grad_norm": 2.9708733558654785, "kl": 0.022063281387090683, "learning_rate": 4.803333333333334e-06, "loss": -0.0538, "num_tokens": 417764.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 1355 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 5.25, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 5.25, "completions/mean_terminated_length": 5.25, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 25.11111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.1018245667219162, "kl": 0.007839980724384077, "learning_rate": 4.802777777777778e-06, "loss": 0.0003, "num_tokens": 417985.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 1356 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 58.25, "completions/clipped_ratio": 0.0, "completions/max_length": 144.0, "completions/max_terminated_length": 144.0, "completions/mean_length": 58.25, "completions/mean_terminated_length": 58.25, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 25.12962962962963, "frac_reward_zero_std": 0.0, "grad_norm": 2.3758277893066406, "kl": 0.07929578237235546, "learning_rate": 4.802222222222222e-06, "loss": 0.1747, "num_tokens": 418442.0, "reward": 2.875, "reward_std": 3.8810436725616455, "rewards/reward_combined/mean": 2.875, "rewards/reward_combined/std": 3.8810436725616455, "step": 1357 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.75, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 13.75, "completions/mean_terminated_length": 13.75, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 25.14814814814815, "frac_reward_zero_std": 0.0, "grad_norm": 3.1281607151031494, "kl": 0.009591304929926991, "learning_rate": 4.8016666666666676e-06, "loss": -0.077, "num_tokens": 418729.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 1358 }, { "clip_ratio/high_max": 0.014705882407724857, "clip_ratio/high_mean": 0.014705882407724857, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.014705882407724857, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 25.166666666666668, "frac_reward_zero_std": 0.0, "grad_norm": 5.364843368530273, "kl": 0.02136917132884264, "learning_rate": 4.801111111111111e-06, "loss": -0.1183, "num_tokens": 419009.0, "reward": 7.625, "reward_std": 0.25, "rewards/reward_combined/mean": 7.625, "rewards/reward_combined/std": 0.25, "step": 1359 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.25, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 17.25, "completions/mean_terminated_length": 17.25, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 25.185185185185187, "frac_reward_zero_std": 1.0, "grad_norm": 0.019755007699131966, "kl": 0.00433156022336334, "learning_rate": 4.800555555555556e-06, "loss": 0.0002, "num_tokens": 419318.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1360 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 25.203703703703702, "frac_reward_zero_std": 1.0, "grad_norm": 0.0059961420483887196, "kl": 0.010314032435417175, "learning_rate": 4.800000000000001e-06, "loss": 0.0005, "num_tokens": 419554.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 1361 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.25, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 20.25, "completions/mean_terminated_length": 20.25, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 25.22222222222222, "frac_reward_zero_std": 1.0, "grad_norm": 0.07670897990465164, "kl": 0.0302742812782526, "learning_rate": 4.799444444444445e-06, "loss": 0.0015, "num_tokens": 419883.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1362 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 25.24074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.013027582317590714, "kl": 0.0007085800170898438, "learning_rate": 4.798888888888889e-06, "loss": 0.0, "num_tokens": 420143.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 1363 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.25, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 4.25, "completions/mean_terminated_length": 4.25, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 25.25925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.2057649940252304, "kl": 0.010949065908789635, "learning_rate": 4.798333333333334e-06, "loss": 0.0006, "num_tokens": 420356.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 1364 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.0, "completions/max_length": 33.0, "completions/max_terminated_length": 33.0, "completions/mean_length": 32.0, "completions/mean_terminated_length": 32.0, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 25.27777777777778, "frac_reward_zero_std": 0.0, "grad_norm": 2.6654880046844482, "kl": 0.11440582200884819, "learning_rate": 4.797777777777778e-06, "loss": 0.0051, "num_tokens": 420708.0, "reward": 3.5, "reward_std": 2.915475845336914, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 2.915475845336914, "step": 1365 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 35.5, "completions/clipped_ratio": 0.0, "completions/max_length": 44.0, "completions/max_terminated_length": 44.0, "completions/mean_length": 35.5, "completions/mean_terminated_length": 35.5, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 25.296296296296298, "frac_reward_zero_std": 0.0, "grad_norm": 4.349661350250244, "kl": 0.027234064415097237, "learning_rate": 4.797222222222222e-06, "loss": -0.02, "num_tokens": 421066.0, "reward": 1.75, "reward_std": 2.1794495582580566, "rewards/reward_combined/mean": 1.75, "rewards/reward_combined/std": 2.1794495582580566, "step": 1366 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.75, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 18.75, "completions/mean_terminated_length": 18.75, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 25.314814814814813, "frac_reward_zero_std": 1.0, "grad_norm": 0.058144573122262955, "kl": 0.010552361607551575, "learning_rate": 4.796666666666668e-06, "loss": 0.0005, "num_tokens": 421361.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1367 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.0, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 17.0, "completions/mean_terminated_length": 17.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 25.333333333333332, "frac_reward_zero_std": 1.0, "grad_norm": 0.21368466317653656, "kl": 0.0480164997279644, "learning_rate": 4.796111111111111e-06, "loss": 0.0023, "num_tokens": 421677.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 1368 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 25.35185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.13686755299568176, "kl": 0.008784756064414978, "learning_rate": 4.795555555555556e-06, "loss": 0.0004, "num_tokens": 421889.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 1369 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 25.37037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.05204093083739281, "kl": 0.0027319056680426, "learning_rate": 4.795e-06, "loss": 0.0001, "num_tokens": 422159.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1370 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 25.38888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.07700635492801666, "kl": 0.006906124850502238, "learning_rate": 4.794444444444445e-06, "loss": 0.0003, "num_tokens": 422419.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 1371 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.25, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.25, "completions/mean_terminated_length": 11.25, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 25.40740740740741, "frac_reward_zero_std": 1.0, "grad_norm": 0.08548451960086823, "kl": 0.006750643020495772, "learning_rate": 4.793888888888889e-06, "loss": 0.0003, "num_tokens": 422692.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1372 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.0, "completions/clipped_ratio": 0.0, "completions/max_length": 29.0, "completions/max_terminated_length": 29.0, "completions/mean_length": 19.0, "completions/mean_terminated_length": 19.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 25.425925925925927, "frac_reward_zero_std": 1.0, "grad_norm": 0.1388915628194809, "kl": 0.09178338572382927, "learning_rate": 4.793333333333334e-06, "loss": 0.0046, "num_tokens": 422984.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1373 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.5, "completions/clipped_ratio": 0.0, "completions/max_length": 25.0, "completions/max_terminated_length": 25.0, "completions/mean_length": 21.5, "completions/mean_terminated_length": 21.5, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 25.444444444444443, "frac_reward_zero_std": 0.0, "grad_norm": 6.325643539428711, "kl": 0.06225571595132351, "learning_rate": 4.792777777777778e-06, "loss": -0.0269, "num_tokens": 423290.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 1374 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.0, "completions/clipped_ratio": 0.0, "completions/max_length": 27.0, "completions/max_terminated_length": 27.0, "completions/mean_length": 20.0, "completions/mean_terminated_length": 20.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 25.462962962962962, "frac_reward_zero_std": 1.0, "grad_norm": 0.12637831270694733, "kl": 0.06931752525269985, "learning_rate": 4.7922222222222225e-06, "loss": 0.0038, "num_tokens": 423590.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1375 }, { "clip_ratio/high_max": 0.014705882407724857, "clip_ratio/high_mean": 0.014705882407724857, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.014705882407724857, "completion_length": 17.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 17.0, "completions/mean_terminated_length": 17.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 25.48148148148148, "frac_reward_zero_std": 0.0, "grad_norm": 3.544632911682129, "kl": 0.02149984799325466, "learning_rate": 4.791666666666668e-06, "loss": -0.0018, "num_tokens": 423882.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 1376 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.75, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.75, "completions/mean_terminated_length": 15.75, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 25.5, "frac_reward_zero_std": 1.0, "grad_norm": 0.010125894099473953, "kl": 0.0009772772900760174, "learning_rate": 4.791111111111111e-06, "loss": 0.0, "num_tokens": 424205.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1377 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.75, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 21.75, "completions/mean_terminated_length": 21.75, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 25.51851851851852, "frac_reward_zero_std": 0.0, "grad_norm": 8.03516960144043, "kl": 0.09219470247626305, "learning_rate": 4.790555555555556e-06, "loss": 0.0533, "num_tokens": 424520.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 1378 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 29.75, "completions/clipped_ratio": 0.0, "completions/max_length": 39.0, "completions/max_terminated_length": 39.0, "completions/mean_length": 29.75, "completions/mean_terminated_length": 29.75, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 25.537037037037038, "frac_reward_zero_std": 0.0, "grad_norm": 4.356597900390625, "kl": 0.14101029559969902, "learning_rate": 4.79e-06, "loss": 0.0075, "num_tokens": 424867.0, "reward": 4.0, "reward_std": 4.041451930999756, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 4.041451930999756, "step": 1379 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.25, "completions/clipped_ratio": 0.0, "completions/max_length": 30.0, "completions/max_terminated_length": 30.0, "completions/mean_length": 22.25, "completions/mean_terminated_length": 22.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 25.555555555555557, "frac_reward_zero_std": 0.0, "grad_norm": 4.566723823547363, "kl": 0.12090682983398438, "learning_rate": 4.789444444444445e-06, "loss": 0.2129, "num_tokens": 425196.0, "reward": 4.5, "reward_std": 4.041451930999756, "rewards/reward_combined/mean": 4.5, "rewards/reward_combined/std": 4.041451930999756, "step": 1380 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 25.574074074074073, "frac_reward_zero_std": 1.0, "grad_norm": 0.10315383225679398, "kl": 0.016671439167112112, "learning_rate": 4.7888888888888894e-06, "loss": 0.0008, "num_tokens": 425486.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1381 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 25.59259259259259, "frac_reward_zero_std": 1.0, "grad_norm": 0.32094600796699524, "kl": 0.017856448888778687, "learning_rate": 4.788333333333334e-06, "loss": 0.0009, "num_tokens": 425698.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 1382 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.0, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 22.0, "completions/mean_terminated_length": 22.0, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 25.61111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.08610950410366058, "kl": 0.0076073152013123035, "learning_rate": 4.787777777777778e-06, "loss": 0.0004, "num_tokens": 426030.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1383 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 18.0, "completions/mean_terminated_length": 18.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 25.62962962962963, "frac_reward_zero_std": 0.0, "grad_norm": 4.4160261154174805, "kl": 0.07191843539476395, "learning_rate": 4.7872222222222225e-06, "loss": 0.032, "num_tokens": 426342.0, "reward": 1.625, "reward_std": 1.25, "rewards/reward_combined/mean": 1.625, "rewards/reward_combined/std": 1.25, "step": 1384 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.75, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 13.75, "completions/mean_terminated_length": 13.75, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 25.64814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.03770093619823456, "kl": 0.006230844766832888, "learning_rate": 4.786666666666667e-06, "loss": 0.0003, "num_tokens": 426621.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1385 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 25.666666666666668, "frac_reward_zero_std": 1.0, "grad_norm": 0.013898899778723717, "kl": 0.003854731097817421, "learning_rate": 4.786111111111111e-06, "loss": 0.0002, "num_tokens": 426899.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1386 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.75, "completions/clipped_ratio": 0.0, "completions/max_length": 34.0, "completions/max_terminated_length": 34.0, "completions/mean_length": 15.75, "completions/mean_terminated_length": 15.75, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 25.685185185185187, "frac_reward_zero_std": 1.0, "grad_norm": 0.08048436045646667, "kl": 0.011429541744291782, "learning_rate": 4.785555555555556e-06, "loss": 0.0006, "num_tokens": 427182.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1387 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 96.25, "completions/clipped_ratio": 0.25, "completions/max_length": 256.0, "completions/max_terminated_length": 43.0, "completions/mean_length": 96.25, "completions/mean_terminated_length": 43.0, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 25.703703703703702, "frac_reward_zero_std": 0.0, "grad_norm": 1.4099245071411133, "kl": 0.18618559837341309, "learning_rate": 4.785e-06, "loss": 0.381, "num_tokens": 427847.0, "reward": 1.4749999046325684, "reward_std": 2.5447659492492676, "rewards/reward_combined/mean": 1.4749999046325684, "rewards/reward_combined/std": 2.5447657108306885, "step": 1388 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.25, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.25, "completions/mean_terminated_length": 13.25, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 25.72222222222222, "frac_reward_zero_std": 1.0, "grad_norm": 0.024550247937440872, "kl": 0.001674121362157166, "learning_rate": 4.784444444444445e-06, "loss": 0.0001, "num_tokens": 428132.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1389 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 25.74074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.0663977637887001, "kl": 0.002258288790471852, "learning_rate": 4.783888888888889e-06, "loss": 0.0001, "num_tokens": 428396.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1390 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.0, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 20.0, "completions/mean_terminated_length": 20.0, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 25.75925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.04494475573301315, "kl": 0.03300418704748154, "learning_rate": 4.783333333333334e-06, "loss": 0.0017, "num_tokens": 428728.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1391 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.75, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.75, "completions/mean_terminated_length": 6.75, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 25.77777777777778, "frac_reward_zero_std": 1.0, "grad_norm": 0.03756489232182503, "kl": 0.004025356407510117, "learning_rate": 4.782777777777778e-06, "loss": 0.0002, "num_tokens": 428963.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 1392 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.0, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 24.0, "completions/mean_terminated_length": 24.0, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 25.796296296296298, "frac_reward_zero_std": 1.0, "grad_norm": 0.003275322960689664, "kl": 0.09044184908270836, "learning_rate": 4.7822222222222226e-06, "loss": 0.0045, "num_tokens": 429327.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 1393 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 25.814814814814813, "frac_reward_zero_std": 0.0, "grad_norm": 4.610541820526123, "kl": 0.013432672712951899, "learning_rate": 4.781666666666667e-06, "loss": -0.0386, "num_tokens": 429631.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 1394 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 25.833333333333332, "frac_reward_zero_std": 1.0, "grad_norm": 0.09281621128320694, "kl": 0.02983465977013111, "learning_rate": 4.781111111111111e-06, "loss": 0.0015, "num_tokens": 429867.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 1395 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 25.85185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.1655174195766449, "kl": 0.025650465860962868, "learning_rate": 4.7805555555555565e-06, "loss": 0.0013, "num_tokens": 430153.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1396 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.0, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 22.0, "completions/mean_terminated_length": 22.0, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 25.87037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.04530677571892738, "kl": 0.019586976617574692, "learning_rate": 4.78e-06, "loss": 0.001, "num_tokens": 430469.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1397 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 25.88888888888889, "frac_reward_zero_std": 0.0, "grad_norm": 7.208233833312988, "kl": 0.04585418710485101, "learning_rate": 4.779444444444445e-06, "loss": 0.1319, "num_tokens": 430737.0, "reward": 6.125, "reward_std": 3.75, "rewards/reward_combined/mean": 6.125, "rewards/reward_combined/std": 3.75, "step": 1398 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 25.90740740740741, "frac_reward_zero_std": 1.0, "grad_norm": 0.034935470670461655, "kl": 0.004299575462937355, "learning_rate": 4.778888888888889e-06, "loss": 0.0002, "num_tokens": 431049.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 1399 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 29.25, "completions/clipped_ratio": 0.0, "completions/max_length": 41.0, "completions/max_terminated_length": 41.0, "completions/mean_length": 29.25, "completions/mean_terminated_length": 29.25, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 25.925925925925927, "frac_reward_zero_std": 0.0, "grad_norm": 3.580291271209717, "kl": 0.05599336139857769, "learning_rate": 4.778333333333334e-06, "loss": 0.1632, "num_tokens": 431418.0, "reward": 4.875, "reward_std": 3.1983067989349365, "rewards/reward_combined/mean": 4.875, "rewards/reward_combined/std": 3.1983067989349365, "step": 1400 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 25.944444444444443, "frac_reward_zero_std": 1.0, "grad_norm": 0.00015968599473126233, "kl": 8.456408977508545e-06, "learning_rate": 4.777777777777778e-06, "loss": 0.0, "num_tokens": 431638.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 1401 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 73.5, "completions/clipped_ratio": 0.25, "completions/max_length": 195.0, "completions/max_terminated_length": 35.0, "completions/mean_length": 73.5, "completions/mean_terminated_length": 33.0, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 25.962962962962962, "frac_reward_zero_std": 0.0, "grad_norm": 1.791509747505188, "kl": 0.05263173207640648, "learning_rate": 4.777222222222223e-06, "loss": 0.3519, "num_tokens": 432168.0, "reward": 2.0, "reward_std": 3.0, "rewards/reward_combined/mean": 2.0, "rewards/reward_combined/std": 3.0, "step": 1402 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.75, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 16.75, "completions/mean_terminated_length": 16.75, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 25.98148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.07807697355747223, "kl": 0.011212260811589658, "learning_rate": 4.776666666666667e-06, "loss": 0.0005, "num_tokens": 432475.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1403 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.5, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 10.5, "completions/mean_terminated_length": 10.5, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 26.0, "frac_reward_zero_std": 0.0, "grad_norm": 4.916395664215088, "kl": 0.08186089992523193, "learning_rate": 4.776111111111111e-06, "loss": -0.0489, "num_tokens": 432741.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 1404 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.75, "completions/clipped_ratio": 0.0, "completions/max_length": 27.0, "completions/max_terminated_length": 27.0, "completions/mean_length": 23.75, "completions/mean_terminated_length": 23.75, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 26.01851851851852, "frac_reward_zero_std": 0.0, "grad_norm": 2.6992180347442627, "kl": 0.24558618664741516, "learning_rate": 4.775555555555556e-06, "loss": 0.011, "num_tokens": 433064.0, "reward": 5.25, "reward_std": 2.598076105117798, "rewards/reward_combined/mean": 5.25, "rewards/reward_combined/std": 2.598076105117798, "step": 1405 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.0, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 20.0, "completions/mean_terminated_length": 20.0, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 26.037037037037038, "frac_reward_zero_std": 1.0, "grad_norm": 0.21960239112377167, "kl": 0.05686299875378609, "learning_rate": 4.775e-06, "loss": 0.0028, "num_tokens": 433396.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1406 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.5, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 26.055555555555557, "frac_reward_zero_std": 1.0, "grad_norm": 0.06568842381238937, "kl": 0.010613234248012304, "learning_rate": 4.774444444444445e-06, "loss": 0.0005, "num_tokens": 433686.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1407 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.75, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 16.75, "completions/mean_terminated_length": 16.75, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 26.074074074074073, "frac_reward_zero_std": 1.0, "grad_norm": 0.024146003648638725, "kl": 0.0016347646014764905, "learning_rate": 4.773888888888889e-06, "loss": 0.0001, "num_tokens": 434013.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1408 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 39.5, "completions/clipped_ratio": 0.0, "completions/max_length": 104.0, "completions/max_terminated_length": 104.0, "completions/mean_length": 39.5, "completions/mean_terminated_length": 39.5, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 26.09259259259259, "frac_reward_zero_std": 1.0, "grad_norm": 0.1807921975851059, "kl": 0.047323331236839294, "learning_rate": 4.773333333333334e-06, "loss": 0.0028, "num_tokens": 434419.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 1409 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 26.11111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.007721307221800089, "kl": 0.017042269930243492, "learning_rate": 4.772777777777778e-06, "loss": 0.0009, "num_tokens": 434679.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1410 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 26.12962962962963, "frac_reward_zero_std": 0.0, "grad_norm": 3.7663779258728027, "kl": 0.006162696052342653, "learning_rate": 4.772222222222223e-06, "loss": -0.002, "num_tokens": 434951.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 1411 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.5, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 8.5, "completions/mean_terminated_length": 8.5, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 26.14814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.016194235533475876, "kl": 0.0009200513013638556, "learning_rate": 4.771666666666667e-06, "loss": 0.0, "num_tokens": 435205.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 1412 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 26.166666666666668, "frac_reward_zero_std": 1.0, "grad_norm": 0.00017743531498126686, "kl": 1.0654330253601074e-05, "learning_rate": 4.771111111111111e-06, "loss": 0.0, "num_tokens": 435425.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 1413 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 43.25, "completions/clipped_ratio": 0.0, "completions/max_length": 66.0, "completions/max_terminated_length": 66.0, "completions/mean_length": 43.25, "completions/mean_terminated_length": 43.25, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 26.185185185185187, "frac_reward_zero_std": 0.0, "grad_norm": 4.031007766723633, "kl": 0.08223836869001389, "learning_rate": 4.770555555555556e-06, "loss": 0.1817, "num_tokens": 435834.0, "reward": 2.875, "reward_std": 1.25, "rewards/reward_combined/mean": 2.875, "rewards/reward_combined/std": 1.25, "step": 1414 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 26.203703703703702, "frac_reward_zero_std": 1.0, "grad_norm": 0.0064307269640266895, "kl": 0.010267265141010284, "learning_rate": 4.77e-06, "loss": 0.0005, "num_tokens": 436070.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 1415 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.75, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 10.75, "completions/mean_terminated_length": 10.75, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 26.22222222222222, "frac_reward_zero_std": 1.0, "grad_norm": 0.1661919504404068, "kl": 0.030985038727521896, "learning_rate": 4.769444444444445e-06, "loss": 0.0015, "num_tokens": 436313.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 1416 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 40.5, "completions/clipped_ratio": 0.0, "completions/max_length": 43.0, "completions/max_terminated_length": 43.0, "completions/mean_length": 40.5, "completions/mean_terminated_length": 40.5, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 26.24074074074074, "frac_reward_zero_std": 0.0, "grad_norm": 1.279727578163147, "kl": 0.09183253347873688, "learning_rate": 4.768888888888889e-06, "loss": -0.0477, "num_tokens": 436755.0, "reward": 2.049999952316284, "reward_std": 1.4177446365356445, "rewards/reward_combined/mean": 2.049999952316284, "rewards/reward_combined/std": 1.417744755744934, "step": 1417 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.5, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 18.5, "completions/mean_terminated_length": 18.5, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 26.25925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.21548384428024292, "kl": 0.0240742398891598, "learning_rate": 4.768333333333334e-06, "loss": 0.0013, "num_tokens": 437069.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1418 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 26.27777777777778, "frac_reward_zero_std": 1.0, "grad_norm": 0.0731562152504921, "kl": 0.009372655302286148, "learning_rate": 4.767777777777778e-06, "loss": 0.0005, "num_tokens": 437329.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 1419 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 26.296296296296298, "frac_reward_zero_std": 1.0, "grad_norm": 0.047177329659461975, "kl": 0.0021621957421302795, "learning_rate": 4.767222222222223e-06, "loss": 0.0001, "num_tokens": 437537.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 1420 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.012500000186264515, "clip_ratio/low_min": 0.012500000186264515, "clip_ratio/region_mean": 0.012500000186264515, "completion_length": 21.25, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 21.25, "completions/mean_terminated_length": 21.25, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 26.314814814814813, "frac_reward_zero_std": 0.0, "grad_norm": 4.1220784187316895, "kl": 0.04953158367425203, "learning_rate": 4.766666666666667e-06, "loss": -0.0183, "num_tokens": 437842.0, "reward": 5.25, "reward_std": 2.598076105117798, "rewards/reward_combined/mean": 5.25, "rewards/reward_combined/std": 2.598076105117798, "step": 1421 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.25, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 17.25, "completions/mean_terminated_length": 17.25, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 26.333333333333332, "frac_reward_zero_std": 1.0, "grad_norm": 0.1273045390844345, "kl": 0.1112312376499176, "learning_rate": 4.766111111111111e-06, "loss": 0.0055, "num_tokens": 438151.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1422 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 6.0, "completions/mean_terminated_length": 6.0, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 26.35185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.1413971334695816, "kl": 0.011386359576135874, "learning_rate": 4.765555555555556e-06, "loss": 0.0007, "num_tokens": 438375.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 1423 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 26.37037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.046707239001989365, "kl": 0.007389646023511887, "learning_rate": 4.765e-06, "loss": 0.0004, "num_tokens": 438639.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1424 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.75, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 4.75, "completions/mean_terminated_length": 4.75, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 26.38888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.11133342236280441, "kl": 0.0026464557740837336, "learning_rate": 4.7644444444444445e-06, "loss": 0.0002, "num_tokens": 438854.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 1425 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.0, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 24.0, "completions/mean_terminated_length": 24.0, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 26.40740740740741, "frac_reward_zero_std": 1.0, "grad_norm": 0.0028407122008502483, "kl": 0.09055589884519577, "learning_rate": 4.763888888888889e-06, "loss": 0.0045, "num_tokens": 439218.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 1426 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 26.425925925925927, "frac_reward_zero_std": 1.0, "grad_norm": 0.017393531277775764, "kl": 0.018343567848205566, "learning_rate": 4.763333333333334e-06, "loss": 0.0009, "num_tokens": 439510.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1427 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.5, "completions/clipped_ratio": 0.0, "completions/max_length": 29.0, "completions/max_terminated_length": 29.0, "completions/mean_length": 17.5, "completions/mean_terminated_length": 17.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 26.444444444444443, "frac_reward_zero_std": 0.0, "grad_norm": 4.87799072265625, "kl": 0.0201865890994668, "learning_rate": 4.762777777777778e-06, "loss": 0.1605, "num_tokens": 439804.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 1428 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.5, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 18.5, "completions/mean_terminated_length": 18.5, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 26.462962962962962, "frac_reward_zero_std": 0.0, "grad_norm": 3.368156671524048, "kl": 0.010497572366148233, "learning_rate": 4.762222222222223e-06, "loss": 0.1242, "num_tokens": 440126.0, "reward": 2.75, "reward_std": 1.5, "rewards/reward_combined/mean": 2.75, "rewards/reward_combined/std": 1.5, "step": 1429 }, { "clip_ratio/high_max": 0.012820512987673283, "clip_ratio/high_mean": 0.012820512987673283, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.012820512987673283, "completion_length": 18.5, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 18.5, "completions/mean_terminated_length": 18.5, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 26.48148148148148, "frac_reward_zero_std": 0.0, "grad_norm": 3.6528635025024414, "kl": 0.15914630144834518, "learning_rate": 4.761666666666667e-06, "loss": -0.0199, "num_tokens": 440428.0, "reward": 5.75, "reward_std": 2.598076105117798, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 2.598076105117798, "step": 1430 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 26.5, "frac_reward_zero_std": 1.0, "grad_norm": 0.12586598098278046, "kl": 0.015595397911965847, "learning_rate": 4.7611111111111115e-06, "loss": 0.0008, "num_tokens": 440718.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1431 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 26.51851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.009825220331549644, "kl": 0.002120271325111389, "learning_rate": 4.760555555555556e-06, "loss": 0.0001, "num_tokens": 440930.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 1432 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 26.537037037037038, "frac_reward_zero_std": 1.0, "grad_norm": 0.014790266752243042, "kl": 0.0016600601375102997, "learning_rate": 4.76e-06, "loss": 0.0001, "num_tokens": 441174.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 1433 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 26.555555555555557, "frac_reward_zero_std": 1.0, "grad_norm": 0.006252662744373083, "kl": 0.000594919896684587, "learning_rate": 4.7594444444444445e-06, "loss": 0.0, "num_tokens": 441430.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 1434 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 26.5, "completions/clipped_ratio": 0.0, "completions/max_length": 38.0, "completions/max_terminated_length": 38.0, "completions/mean_length": 26.5, "completions/mean_terminated_length": 26.5, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 26.574074074074073, "frac_reward_zero_std": 0.0, "grad_norm": 5.6178741455078125, "kl": 0.07513842731714249, "learning_rate": 4.758888888888889e-06, "loss": 0.1847, "num_tokens": 441760.0, "reward": 4.625, "reward_std": 3.4731109142303467, "rewards/reward_combined/mean": 4.625, "rewards/reward_combined/std": 3.4731109142303467, "step": 1435 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.5, "completions/clipped_ratio": 0.0, "completions/max_length": 38.0, "completions/max_terminated_length": 38.0, "completions/mean_length": 24.5, "completions/mean_terminated_length": 24.5, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 26.59259259259259, "frac_reward_zero_std": 0.0, "grad_norm": 5.2885518074035645, "kl": 0.05875059962272644, "learning_rate": 4.758333333333334e-06, "loss": -0.107, "num_tokens": 442078.0, "reward": 4.75, "reward_std": 2.986078977584839, "rewards/reward_combined/mean": 4.75, "rewards/reward_combined/std": 2.986078977584839, "step": 1436 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.75, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 10.75, "completions/mean_terminated_length": 10.75, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 26.61111111111111, "frac_reward_zero_std": 0.0, "grad_norm": 3.158095359802246, "kl": 0.38305389136075974, "learning_rate": 4.7577777777777784e-06, "loss": 0.0027, "num_tokens": 442345.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 1437 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 26.62962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.08432530611753464, "kl": 0.014264136203564703, "learning_rate": 4.757222222222223e-06, "loss": 0.0008, "num_tokens": 442645.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1438 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.25, "completions/clipped_ratio": 0.0, "completions/max_length": 29.0, "completions/max_terminated_length": 29.0, "completions/mean_length": 22.25, "completions/mean_terminated_length": 22.25, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 26.64814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.2782675623893738, "kl": 0.11326053738594055, "learning_rate": 4.756666666666667e-06, "loss": 0.0053, "num_tokens": 442954.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 1439 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.5, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.5, "completions/mean_terminated_length": 6.5, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 26.666666666666668, "frac_reward_zero_std": 1.0, "grad_norm": 0.062268905341625214, "kl": 0.003308618499431759, "learning_rate": 4.7561111111111115e-06, "loss": 0.0002, "num_tokens": 443188.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 1440 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.75, "completions/clipped_ratio": 0.0, "completions/max_length": 33.0, "completions/max_terminated_length": 33.0, "completions/mean_length": 24.75, "completions/mean_terminated_length": 24.75, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 26.685185185185187, "frac_reward_zero_std": 1.0, "grad_norm": 0.5832986831665039, "kl": 0.09798285830765963, "learning_rate": 4.755555555555556e-06, "loss": 0.0057, "num_tokens": 443515.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1441 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 26.703703703703702, "frac_reward_zero_std": 1.0, "grad_norm": 0.21705196797847748, "kl": 0.05368417501449585, "learning_rate": 4.755e-06, "loss": 0.0028, "num_tokens": 443807.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1442 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 26.72222222222222, "frac_reward_zero_std": 1.0, "grad_norm": 0.013702424243092537, "kl": 0.00031940240296535194, "learning_rate": 4.7544444444444446e-06, "loss": 0.0, "num_tokens": 444077.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1443 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 26.74074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.2406647503376007, "kl": 0.016167378053069115, "learning_rate": 4.75388888888889e-06, "loss": 0.0008, "num_tokens": 444333.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1444 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.0, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 21.0, "completions/mean_terminated_length": 21.0, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 26.75925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.07567556947469711, "kl": 0.033681513741612434, "learning_rate": 4.753333333333333e-06, "loss": 0.0017, "num_tokens": 444665.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1445 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 31.5, "completions/clipped_ratio": 0.0, "completions/max_length": 48.0, "completions/max_terminated_length": 48.0, "completions/mean_length": 31.5, "completions/mean_terminated_length": 31.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 26.77777777777778, "frac_reward_zero_std": 0.0, "grad_norm": 3.2069149017333984, "kl": 0.07571517676115036, "learning_rate": 4.7527777777777785e-06, "loss": 0.0126, "num_tokens": 445011.0, "reward": 4.0, "reward_std": 4.041451930999756, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 4.041451930999756, "step": 1446 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.0, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 23.0, "completions/mean_terminated_length": 23.0, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 26.796296296296298, "frac_reward_zero_std": 0.0, "grad_norm": 3.806495189666748, "kl": 0.016193844843655825, "learning_rate": 4.752222222222223e-06, "loss": 0.0398, "num_tokens": 445347.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 1447 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.75, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 14.75, "completions/mean_terminated_length": 14.75, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 26.814814814814813, "frac_reward_zero_std": 1.0, "grad_norm": 0.14792540669441223, "kl": 0.018257134594023228, "learning_rate": 4.751666666666667e-06, "loss": 0.0011, "num_tokens": 445618.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1448 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 35.0, "completions/clipped_ratio": 0.0, "completions/max_length": 39.0, "completions/max_terminated_length": 39.0, "completions/mean_length": 35.0, "completions/mean_terminated_length": 35.0, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 26.833333333333332, "frac_reward_zero_std": 1.0, "grad_norm": 0.08121483772993088, "kl": 0.01737086521461606, "learning_rate": 4.7511111111111116e-06, "loss": 0.0009, "num_tokens": 445974.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 1449 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.75, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 16.75, "completions/mean_terminated_length": 16.75, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 26.85185185185185, "frac_reward_zero_std": 0.0, "grad_norm": 3.4960875511169434, "kl": 0.13234667479991913, "learning_rate": 4.750555555555556e-06, "loss": -0.0091, "num_tokens": 446281.0, "reward": 4.0, "reward_std": 2.915475845336914, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 2.915475845336914, "step": 1450 }, { "clip_ratio/high_max": 0.004854368977248669, "clip_ratio/high_mean": 0.004854368977248669, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004854368977248669, "completion_length": 99.5, "completions/clipped_ratio": 0.25, "completions/max_length": 256.0, "completions/max_terminated_length": 54.0, "completions/mean_length": 99.5, "completions/mean_terminated_length": 47.333335876464844, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "epoch": 26.87037037037037, "frac_reward_zero_std": 0.0, "grad_norm": 1.9882006645202637, "kl": 0.05529517121613026, "learning_rate": 4.75e-06, "loss": 0.3811, "num_tokens": 446931.0, "reward": 0.125, "reward_std": 2.4958298206329346, "rewards/reward_combined/mean": 0.125, "rewards/reward_combined/std": 2.4958298206329346, "step": 1451 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 26.88888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.002585447859019041, "kl": 0.003461186308413744, "learning_rate": 4.749444444444445e-06, "loss": 0.0002, "num_tokens": 447211.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1452 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 28.0, "completions/clipped_ratio": 0.0, "completions/max_length": 31.0, "completions/max_terminated_length": 31.0, "completions/mean_length": 28.0, "completions/mean_terminated_length": 28.0, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 26.90740740740741, "frac_reward_zero_std": 1.0, "grad_norm": 0.743048369884491, "kl": 0.08763420954346657, "learning_rate": 4.74888888888889e-06, "loss": 0.0043, "num_tokens": 447547.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1453 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.75, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 14.75, "completions/mean_terminated_length": 14.75, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 26.925925925925927, "frac_reward_zero_std": 0.0, "grad_norm": 4.2896928787231445, "kl": 0.0894244834780693, "learning_rate": 4.748333333333333e-06, "loss": 0.0582, "num_tokens": 447822.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 1454 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.5, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 12.5, "completions/mean_terminated_length": 12.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 26.944444444444443, "frac_reward_zero_std": 0.0, "grad_norm": 4.263548851013184, "kl": 0.008489423664286733, "learning_rate": 4.7477777777777785e-06, "loss": 0.0602, "num_tokens": 448096.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 1455 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 26.962962962962962, "frac_reward_zero_std": 1.0, "grad_norm": 0.10919497162103653, "kl": 0.02098829112946987, "learning_rate": 4.747222222222223e-06, "loss": 0.001, "num_tokens": 448368.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1456 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.25, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 19.25, "completions/mean_terminated_length": 19.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 26.98148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.0866292342543602, "kl": 0.010402880143374205, "learning_rate": 4.746666666666667e-06, "loss": 0.0006, "num_tokens": 448685.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1457 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 12.5, "completions/mean_terminated_length": 12.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 27.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.04763944819569588, "kl": 0.006324923364445567, "learning_rate": 4.746111111111112e-06, "loss": 0.0003, "num_tokens": 448967.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1458 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 27.01851851851852, "frac_reward_zero_std": 0.0, "grad_norm": 5.088446140289307, "kl": 0.22730765491724014, "learning_rate": 4.745555555555556e-06, "loss": -0.1511, "num_tokens": 449261.0, "reward": 4.0, "reward_std": 2.915475845336914, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 2.915475845336914, "step": 1459 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 27.037037037037038, "frac_reward_zero_std": 1.0, "grad_norm": 0.06564030796289444, "kl": 0.012959839310497046, "learning_rate": 4.745e-06, "loss": 0.0007, "num_tokens": 449561.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1460 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 7.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 7.0, "completions/mean_terminated_length": 7.0, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 27.055555555555557, "frac_reward_zero_std": 1.0, "grad_norm": 0.02814163826406002, "kl": 0.002249977318570018, "learning_rate": 4.744444444444445e-06, "loss": 0.0001, "num_tokens": 449797.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 1461 }, { "clip_ratio/high_max": 0.009259259328246117, "clip_ratio/high_mean": 0.009259259328246117, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.009259259328246117, "completion_length": 28.25, "completions/clipped_ratio": 0.0, "completions/max_length": 34.0, "completions/max_terminated_length": 34.0, "completions/mean_length": 28.25, "completions/mean_terminated_length": 28.25, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 27.074074074074073, "frac_reward_zero_std": 0.0, "grad_norm": 3.960486650466919, "kl": 0.09077312797307968, "learning_rate": 4.74388888888889e-06, "loss": 0.1291, "num_tokens": 450126.0, "reward": 4.625, "reward_std": 2.25, "rewards/reward_combined/mean": 4.625, "rewards/reward_combined/std": 2.25, "step": 1462 }, { "clip_ratio/high_max": 0.009259259328246117, "clip_ratio/high_mean": 0.009259259328246117, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.009259259328246117, "completion_length": 22.5, "completions/clipped_ratio": 0.0, "completions/max_length": 31.0, "completions/max_terminated_length": 31.0, "completions/mean_length": 22.5, "completions/mean_terminated_length": 22.5, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 27.09259259259259, "frac_reward_zero_std": 0.0, "grad_norm": 4.840166091918945, "kl": 0.11161916330456734, "learning_rate": 4.743333333333333e-06, "loss": 0.0821, "num_tokens": 450440.0, "reward": 4.625, "reward_std": 3.4731109142303467, "rewards/reward_combined/mean": 4.625, "rewards/reward_combined/std": 3.4731109142303467, "step": 1463 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 27.11111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.015540825203061104, "kl": 0.0041747428476810455, "learning_rate": 4.742777777777779e-06, "loss": 0.0002, "num_tokens": 450692.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1464 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.0, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 20.0, "completions/mean_terminated_length": 20.0, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 27.12962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.9200214743614197, "kl": 0.18690890446305275, "learning_rate": 4.742222222222222e-06, "loss": 0.0096, "num_tokens": 450992.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1465 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 27.14814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.050471652299165726, "kl": 0.017459758557379246, "learning_rate": 4.741666666666667e-06, "loss": 0.0009, "num_tokens": 451304.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 1466 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.0, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 22.0, "completions/mean_terminated_length": 22.0, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 27.166666666666668, "frac_reward_zero_std": 1.0, "grad_norm": 0.05648152530193329, "kl": 0.015830250456929207, "learning_rate": 4.741111111111112e-06, "loss": 0.0008, "num_tokens": 451620.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1467 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 27.185185185185187, "frac_reward_zero_std": 1.0, "grad_norm": 0.3469277024269104, "kl": 0.06487790122628212, "learning_rate": 4.740555555555556e-06, "loss": 0.0032, "num_tokens": 451900.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1468 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.75, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 17.75, "completions/mean_terminated_length": 17.75, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 27.203703703703702, "frac_reward_zero_std": 1.0, "grad_norm": 0.011828791350126266, "kl": 0.0033006101148203015, "learning_rate": 4.74e-06, "loss": 0.0001, "num_tokens": 452199.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1469 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.25, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 18.25, "completions/mean_terminated_length": 18.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 27.22222222222222, "frac_reward_zero_std": 1.0, "grad_norm": 0.10720457881689072, "kl": 0.0325452359393239, "learning_rate": 4.739444444444445e-06, "loss": 0.0016, "num_tokens": 452520.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1470 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.75, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 23.75, "completions/mean_terminated_length": 23.75, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 27.24074074074074, "frac_reward_zero_std": 0.0, "grad_norm": 2.2968435287475586, "kl": 0.9372318014502525, "learning_rate": 4.73888888888889e-06, "loss": 0.0345, "num_tokens": 452883.0, "reward": 2.875, "reward_std": 1.25, "rewards/reward_combined/mean": 2.875, "rewards/reward_combined/std": 1.25, "step": 1471 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 27.25925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.015925126150250435, "kl": 0.0004553869366645813, "learning_rate": 4.7383333333333334e-06, "loss": 0.0, "num_tokens": 453095.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 1472 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.0, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 19.0, "completions/mean_terminated_length": 19.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 27.27777777777778, "frac_reward_zero_std": 1.0, "grad_norm": 0.19456353783607483, "kl": 0.047659204341471195, "learning_rate": 4.737777777777779e-06, "loss": 0.0024, "num_tokens": 453419.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 1473 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 27.296296296296298, "frac_reward_zero_std": 1.0, "grad_norm": 0.41592493653297424, "kl": 0.04608049616217613, "learning_rate": 4.737222222222222e-06, "loss": 0.0028, "num_tokens": 453683.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 1474 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 27.314814814814813, "frac_reward_zero_std": 1.0, "grad_norm": 0.15069033205509186, "kl": 0.023021024651825428, "learning_rate": 4.736666666666667e-06, "loss": 0.0012, "num_tokens": 453971.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1475 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 27.333333333333332, "frac_reward_zero_std": 1.0, "grad_norm": 0.021215755492448807, "kl": 0.016359763219952583, "learning_rate": 4.736111111111112e-06, "loss": 0.0008, "num_tokens": 454243.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1476 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 27.35185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.09248959273099899, "kl": 0.007572232978418469, "learning_rate": 4.735555555555556e-06, "loss": 0.0004, "num_tokens": 454513.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1477 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 27.37037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.00012314347259234637, "kl": 6.556510925292969e-06, "learning_rate": 4.735e-06, "loss": 0.0, "num_tokens": 454733.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 1478 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.021739130839705467, "clip_ratio/low_min": 0.021739130839705467, "clip_ratio/region_mean": 0.021739130839705467, "completion_length": 11.75, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.75, "completions/mean_terminated_length": 11.75, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 27.38888888888889, "frac_reward_zero_std": 0.0, "grad_norm": 8.704748153686523, "kl": 0.06571188941597939, "learning_rate": 4.734444444444445e-06, "loss": 0.0107, "num_tokens": 454996.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 1479 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.75, "completions/clipped_ratio": 0.0, "completions/max_length": 27.0, "completions/max_terminated_length": 27.0, "completions/mean_length": 23.75, "completions/mean_terminated_length": 23.75, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 27.40740740740741, "frac_reward_zero_std": 1.0, "grad_norm": 0.17208848893642426, "kl": 0.11668592691421509, "learning_rate": 4.733888888888889e-06, "loss": 0.0057, "num_tokens": 455319.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1480 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.75, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 11.75, "completions/mean_terminated_length": 11.75, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 27.425925925925927, "frac_reward_zero_std": 1.0, "grad_norm": 0.3344695270061493, "kl": 0.09724758192896843, "learning_rate": 4.7333333333333335e-06, "loss": 0.0052, "num_tokens": 455590.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1481 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.75, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 11.75, "completions/mean_terminated_length": 11.75, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 27.444444444444443, "frac_reward_zero_std": 1.0, "grad_norm": 0.11077798157930374, "kl": 0.03313850797712803, "learning_rate": 4.732777777777779e-06, "loss": 0.0016, "num_tokens": 455861.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1482 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 38.75, "completions/clipped_ratio": 0.0, "completions/max_length": 67.0, "completions/max_terminated_length": 67.0, "completions/mean_length": 38.75, "completions/mean_terminated_length": 38.75, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 27.462962962962962, "frac_reward_zero_std": 0.0, "grad_norm": 2.5204050540924072, "kl": 0.058114251121878624, "learning_rate": 4.732222222222222e-06, "loss": 0.0037, "num_tokens": 456252.0, "reward": 2.25, "reward_std": 1.443375587463379, "rewards/reward_combined/mean": 2.25, "rewards/reward_combined/std": 1.4433757066726685, "step": 1483 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 27.48148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.08995319902896881, "kl": 0.0048093050718307495, "learning_rate": 4.731666666666667e-06, "loss": 0.0002, "num_tokens": 456464.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 1484 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 27.5, "frac_reward_zero_std": 1.0, "grad_norm": 0.029987799003720284, "kl": 0.004504490876570344, "learning_rate": 4.731111111111112e-06, "loss": 0.0002, "num_tokens": 456766.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1485 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 27.51851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.021055251359939575, "kl": 0.017238127999007702, "learning_rate": 4.730555555555556e-06, "loss": 0.0009, "num_tokens": 457058.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1486 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 27.537037037037038, "frac_reward_zero_std": 1.0, "grad_norm": 0.8563486933708191, "kl": 0.08245152235031128, "learning_rate": 4.7300000000000005e-06, "loss": 0.0045, "num_tokens": 457338.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1487 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 43.0, "completions/clipped_ratio": 0.0, "completions/max_length": 43.0, "completions/max_terminated_length": 43.0, "completions/mean_length": 43.0, "completions/mean_terminated_length": 43.0, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 27.555555555555557, "frac_reward_zero_std": 1.0, "grad_norm": 0.018166547641158104, "kl": 0.056596847251057625, "learning_rate": 4.729444444444445e-06, "loss": 0.0028, "num_tokens": 457790.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 1488 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 27.574074074074073, "frac_reward_zero_std": 1.0, "grad_norm": 0.0077817244455218315, "kl": 0.009928032755851746, "learning_rate": 4.728888888888889e-06, "loss": 0.0005, "num_tokens": 458026.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 1489 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 89.25, "completions/clipped_ratio": 0.25, "completions/max_length": 256.0, "completions/max_terminated_length": 39.0, "completions/mean_length": 89.25, "completions/mean_terminated_length": 33.66666793823242, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 27.59259259259259, "frac_reward_zero_std": 0.0, "grad_norm": 1.772735834121704, "kl": 0.039492045529186726, "learning_rate": 4.7283333333333335e-06, "loss": 0.4387, "num_tokens": 458607.0, "reward": 4.425000190734863, "reward_std": 3.797696828842163, "rewards/reward_combined/mean": 4.425000190734863, "rewards/reward_combined/std": 3.797696828842163, "step": 1490 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.0, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 20.0, "completions/mean_terminated_length": 20.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 27.61111111111111, "frac_reward_zero_std": 0.0, "grad_norm": 2.8735463619232178, "kl": 0.23625915497541428, "learning_rate": 4.727777777777779e-06, "loss": -0.0548, "num_tokens": 458915.0, "reward": 5.75, "reward_std": 2.598076105117798, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 2.598076105117798, "step": 1491 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 5.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 5.0, "completions/mean_terminated_length": 5.0, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 27.62962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.009357519447803497, "kl": 0.0009869575151242316, "learning_rate": 4.727222222222222e-06, "loss": 0.0, "num_tokens": 459135.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 1492 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 27.64814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.04514807090163231, "kl": 0.012581298593431711, "learning_rate": 4.7266666666666674e-06, "loss": 0.0006, "num_tokens": 459423.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1493 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 7.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 7.0, "completions/mean_terminated_length": 7.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 27.666666666666668, "frac_reward_zero_std": 0.0, "grad_norm": 10.886627197265625, "kl": 0.05266755819320679, "learning_rate": 4.726111111111111e-06, "loss": -0.0022, "num_tokens": 459651.0, "reward": 2.5, "reward_std": 1.7320507764816284, "rewards/reward_combined/mean": 2.5, "rewards/reward_combined/std": 1.7320507764816284, "step": 1494 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 10.5, "completions/mean_terminated_length": 10.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 27.685185185185187, "frac_reward_zero_std": 1.0, "grad_norm": 0.1832783967256546, "kl": 0.01864550224854611, "learning_rate": 4.725555555555556e-06, "loss": 0.001, "num_tokens": 459913.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 1495 }, { "clip_ratio/high_max": 0.0071428571827709675, "clip_ratio/high_mean": 0.0071428571827709675, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0071428571827709675, "completion_length": 26.75, "completions/clipped_ratio": 0.0, "completions/max_length": 43.0, "completions/max_terminated_length": 43.0, "completions/mean_length": 26.75, "completions/mean_terminated_length": 26.75, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 27.703703703703702, "frac_reward_zero_std": 0.0, "grad_norm": 3.0351274013519287, "kl": 0.12288649752736092, "learning_rate": 4.7250000000000005e-06, "loss": 0.1197, "num_tokens": 460240.0, "reward": 7.125, "reward_std": 0.75, "rewards/reward_combined/mean": 7.125, "rewards/reward_combined/std": 0.75, "step": 1496 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 27.72222222222222, "frac_reward_zero_std": 1.0, "grad_norm": 0.003119208849966526, "kl": 0.0018064723117277026, "learning_rate": 4.724444444444445e-06, "loss": 0.0001, "num_tokens": 460524.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1497 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 27.74074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.013989347033202648, "kl": 0.0015136376023292542, "learning_rate": 4.723888888888889e-06, "loss": 0.0001, "num_tokens": 460768.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 1498 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.5, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 20.5, "completions/mean_terminated_length": 20.5, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 27.75925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.06653355062007904, "kl": 0.023010283708572388, "learning_rate": 4.7233333333333336e-06, "loss": 0.0011, "num_tokens": 461102.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1499 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 27.77777777777778, "frac_reward_zero_std": 1.0, "grad_norm": 0.03356068208813667, "kl": 0.00113907759077847, "learning_rate": 4.722777777777779e-06, "loss": 0.0001, "num_tokens": 461358.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 1500 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.5, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 22.5, "completions/mean_terminated_length": 22.5, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 27.796296296296298, "frac_reward_zero_std": 1.0, "grad_norm": 0.11700949817895889, "kl": 0.012202746933326125, "learning_rate": 4.722222222222222e-06, "loss": 0.0006, "num_tokens": 461692.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1501 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.25, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.25, "completions/mean_terminated_length": 15.25, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 27.814814814814813, "frac_reward_zero_std": 1.0, "grad_norm": 0.004807391669601202, "kl": 0.0007115562912076712, "learning_rate": 4.7216666666666675e-06, "loss": 0.0, "num_tokens": 462013.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1502 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.0, "completions/clipped_ratio": 0.0, "completions/max_length": 30.0, "completions/max_terminated_length": 30.0, "completions/mean_length": 20.0, "completions/mean_terminated_length": 20.0, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 27.833333333333332, "frac_reward_zero_std": 1.0, "grad_norm": 0.15872126817703247, "kl": 0.06781024113297462, "learning_rate": 4.721111111111111e-06, "loss": 0.0034, "num_tokens": 462313.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 1503 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 27.85185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.05290369689464569, "kl": 0.005247580353170633, "learning_rate": 4.720555555555556e-06, "loss": 0.0002, "num_tokens": 462581.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1504 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 27.87037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.010343488305807114, "kl": 0.016129855066537857, "learning_rate": 4.7200000000000005e-06, "loss": 0.0008, "num_tokens": 462841.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1505 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.5, "completions/clipped_ratio": 0.0, "completions/max_length": 34.0, "completions/max_terminated_length": 34.0, "completions/mean_length": 25.5, "completions/mean_terminated_length": 25.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 27.88888888888889, "frac_reward_zero_std": 0.0, "grad_norm": 3.6947011947631836, "kl": 0.15766378119587898, "learning_rate": 4.719444444444445e-06, "loss": 0.0392, "num_tokens": 463195.0, "reward": 5.875, "reward_std": 3.25, "rewards/reward_combined/mean": 5.875, "rewards/reward_combined/std": 3.25, "step": 1506 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.25, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 16.25, "completions/mean_terminated_length": 16.25, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 27.90740740740741, "frac_reward_zero_std": 0.0, "grad_norm": 5.373074531555176, "kl": 0.24574843421578407, "learning_rate": 4.718888888888889e-06, "loss": 0.0602, "num_tokens": 463500.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 1507 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.5, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 12.5, "completions/mean_terminated_length": 12.5, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 27.925925925925927, "frac_reward_zero_std": 1.0, "grad_norm": 0.030921733006834984, "kl": 0.00433326733764261, "learning_rate": 4.718333333333334e-06, "loss": 0.0002, "num_tokens": 463774.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1508 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 27.944444444444443, "frac_reward_zero_std": 1.0, "grad_norm": 0.007151540368795395, "kl": 0.000277496874332428, "learning_rate": 4.717777777777778e-06, "loss": 0.0, "num_tokens": 464046.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1509 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.5, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 2.5, "completions/mean_terminated_length": 2.5, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 27.962962962962962, "frac_reward_zero_std": 1.0, "grad_norm": 0.0820825919508934, "kl": 0.004355892539024353, "learning_rate": 4.717222222222222e-06, "loss": 0.0002, "num_tokens": 464252.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 1510 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.75, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 9.75, "completions/mean_terminated_length": 9.75, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 27.98148148148148, "frac_reward_zero_std": 0.0, "grad_norm": 9.256240844726562, "kl": 0.03382151201367378, "learning_rate": 4.7166666666666675e-06, "loss": -0.1473, "num_tokens": 464519.0, "reward": 5.125, "reward_std": 3.4731109142303467, "rewards/reward_combined/mean": 5.125, "rewards/reward_combined/std": 3.4731109142303467, "step": 1511 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 18.0, "completions/mean_terminated_length": 18.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 28.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.005189281422644854, "kl": 0.003450115560553968, "learning_rate": 4.716111111111111e-06, "loss": 0.0002, "num_tokens": 464831.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1512 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.75, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 15.75, "completions/mean_terminated_length": 15.75, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 28.01851851851852, "frac_reward_zero_std": 0.0, "grad_norm": 4.251321315765381, "kl": 0.23470714688301086, "learning_rate": 4.715555555555556e-06, "loss": -0.0059, "num_tokens": 465134.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 1513 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 28.75, "completions/clipped_ratio": 0.0, "completions/max_length": 32.0, "completions/max_terminated_length": 32.0, "completions/mean_length": 28.75, "completions/mean_terminated_length": 28.75, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 28.037037037037038, "frac_reward_zero_std": 0.0, "grad_norm": 1.8581900596618652, "kl": 0.10721547901630402, "learning_rate": 4.715e-06, "loss": 0.0027, "num_tokens": 465485.0, "reward": 3.25, "reward_std": 0.28867512941360474, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 0.28867512941360474, "step": 1514 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 28.055555555555557, "frac_reward_zero_std": 1.0, "grad_norm": 0.023489201441407204, "kl": 0.016349329613149166, "learning_rate": 4.714444444444445e-06, "loss": 0.0008, "num_tokens": 465777.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1515 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 28.074074074074073, "frac_reward_zero_std": 1.0, "grad_norm": 9.538557787891477e-05, "kl": 5.036592483520508e-06, "learning_rate": 4.713888888888889e-06, "loss": 0.0, "num_tokens": 465997.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 1516 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 28.09259259259259, "frac_reward_zero_std": 1.0, "grad_norm": 0.028324652463197708, "kl": 0.0025349780917167664, "learning_rate": 4.713333333333334e-06, "loss": 0.0001, "num_tokens": 466205.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 1517 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.0, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 20.0, "completions/mean_terminated_length": 20.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 28.11111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.11059805750846863, "kl": 0.028930427506566048, "learning_rate": 4.712777777777778e-06, "loss": 0.0014, "num_tokens": 466537.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1518 }, { "clip_ratio/high_max": 0.008620689623057842, "clip_ratio/high_mean": 0.008620689623057842, "clip_ratio/low_mean": 0.006329114083200693, "clip_ratio/low_min": 0.006329114083200693, "clip_ratio/region_mean": 0.014949803706258535, "completion_length": 34.25, "completions/clipped_ratio": 0.0, "completions/max_length": 43.0, "completions/max_terminated_length": 43.0, "completions/mean_length": 34.25, "completions/mean_terminated_length": 34.25, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 28.12962962962963, "frac_reward_zero_std": 0.0, "grad_norm": 4.9843549728393555, "kl": 0.05670413561165333, "learning_rate": 4.712222222222222e-06, "loss": -0.0023, "num_tokens": 466890.0, "reward": 4.0, "reward_std": 2.915475845336914, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 2.915475845336914, "step": 1519 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.0, "completions/clipped_ratio": 0.0, "completions/max_length": 33.0, "completions/max_terminated_length": 33.0, "completions/mean_length": 25.0, "completions/mean_terminated_length": 25.0, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 28.14814814814815, "frac_reward_zero_std": 0.0, "grad_norm": 2.632441759109497, "kl": 0.13782588578760624, "learning_rate": 4.711666666666668e-06, "loss": -0.0861, "num_tokens": 467242.0, "reward": 5.875, "reward_std": 3.5910768508911133, "rewards/reward_combined/mean": 5.875, "rewards/reward_combined/std": 3.5910770893096924, "step": 1520 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.5, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 8.5, "completions/mean_terminated_length": 8.5, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 28.166666666666668, "frac_reward_zero_std": 1.0, "grad_norm": 0.15759152173995972, "kl": 0.017504887655377388, "learning_rate": 4.711111111111111e-06, "loss": 0.0009, "num_tokens": 467496.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 1521 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 28.185185185185187, "frac_reward_zero_std": 1.0, "grad_norm": 0.018405111506581306, "kl": 0.01516400882974267, "learning_rate": 4.710555555555556e-06, "loss": 0.0008, "num_tokens": 467768.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1522 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 7.75, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 7.75, "completions/mean_terminated_length": 7.75, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 28.203703703703702, "frac_reward_zero_std": 0.0, "grad_norm": 5.077430725097656, "kl": 0.016722829546779394, "learning_rate": 4.71e-06, "loss": -0.0289, "num_tokens": 468027.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 1523 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.25, "completions/clipped_ratio": 0.0, "completions/max_length": 27.0, "completions/max_terminated_length": 27.0, "completions/mean_length": 23.25, "completions/mean_terminated_length": 23.25, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 28.22222222222222, "frac_reward_zero_std": 0.0, "grad_norm": 3.059521198272705, "kl": 0.016711488366127014, "learning_rate": 4.709444444444445e-06, "loss": 0.052, "num_tokens": 468364.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 1524 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 28.24074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.006442556157708168, "kl": 0.0006509826052933931, "learning_rate": 4.708888888888889e-06, "loss": 0.0, "num_tokens": 468684.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1525 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 28.25925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.06791870296001434, "kl": 0.058866800740361214, "learning_rate": 4.708333333333334e-06, "loss": 0.0032, "num_tokens": 468964.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 1526 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 28.27777777777778, "frac_reward_zero_std": 1.0, "grad_norm": 0.024985535070300102, "kl": 0.0005994424282107502, "learning_rate": 4.707777777777778e-06, "loss": 0.0, "num_tokens": 469232.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1527 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.5, "completions/clipped_ratio": 0.0, "completions/max_length": 48.0, "completions/max_terminated_length": 48.0, "completions/mean_length": 23.5, "completions/mean_terminated_length": 23.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 28.296296296296298, "frac_reward_zero_std": 0.0, "grad_norm": 5.8118720054626465, "kl": 0.05891389865428209, "learning_rate": 4.707222222222222e-06, "loss": 0.2906, "num_tokens": 469554.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 1528 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 26.0, "completions/clipped_ratio": 0.0, "completions/max_length": 31.0, "completions/max_terminated_length": 31.0, "completions/mean_length": 26.0, "completions/mean_terminated_length": 26.0, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 28.314814814814813, "frac_reward_zero_std": 0.0, "grad_norm": 2.7744643688201904, "kl": 0.12136195600032806, "learning_rate": 4.706666666666667e-06, "loss": 0.0275, "num_tokens": 469886.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 1529 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.25, "completions/clipped_ratio": 0.0, "completions/max_length": 25.0, "completions/max_terminated_length": 25.0, "completions/mean_length": 21.25, "completions/mean_terminated_length": 21.25, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 28.333333333333332, "frac_reward_zero_std": 0.0, "grad_norm": 5.9395270347595215, "kl": 0.042697624303400517, "learning_rate": 4.706111111111111e-06, "loss": 0.0627, "num_tokens": 470219.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 1530 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 28.35185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.01565675064921379, "kl": 0.03166612982749939, "learning_rate": 4.705555555555556e-06, "loss": 0.0016, "num_tokens": 470435.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 1531 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 28.37037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.05686667189002037, "kl": 0.006033208803273737, "learning_rate": 4.705e-06, "loss": 0.0003, "num_tokens": 470733.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1532 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.25, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 9.25, "completions/mean_terminated_length": 9.25, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 28.38888888888889, "frac_reward_zero_std": 0.0, "grad_norm": 2.2245709896087646, "kl": 0.30529039446264505, "learning_rate": 4.704444444444445e-06, "loss": 0.0428, "num_tokens": 470994.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 1533 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 5.5, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 5.5, "completions/mean_terminated_length": 5.5, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 28.40740740740741, "frac_reward_zero_std": 0.0, "grad_norm": 11.2781343460083, "kl": 0.022389324731193483, "learning_rate": 4.703888888888889e-06, "loss": 0.0782, "num_tokens": 471216.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 1534 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 31.5, "completions/clipped_ratio": 0.0, "completions/max_length": 35.0, "completions/max_terminated_length": 35.0, "completions/mean_length": 31.5, "completions/mean_terminated_length": 31.5, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 28.425925925925927, "frac_reward_zero_std": 1.0, "grad_norm": 0.3125435709953308, "kl": 0.05582912638783455, "learning_rate": 4.703333333333334e-06, "loss": 0.0028, "num_tokens": 471566.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1535 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 28.444444444444443, "frac_reward_zero_std": 1.0, "grad_norm": 2.2808420658111572, "kl": 0.1612861508037895, "learning_rate": 4.702777777777778e-06, "loss": 0.0096, "num_tokens": 471844.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1536 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.25, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 18.25, "completions/mean_terminated_length": 18.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 28.462962962962962, "frac_reward_zero_std": 0.0, "grad_norm": 5.5556721687316895, "kl": 0.14668408408761024, "learning_rate": 4.7022222222222225e-06, "loss": -0.0065, "num_tokens": 472145.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 1537 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.25, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 17.25, "completions/mean_terminated_length": 17.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 28.48148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.10384046286344528, "kl": 0.015818612184375525, "learning_rate": 4.701666666666667e-06, "loss": 0.0008, "num_tokens": 472454.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1538 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 18.0, "completions/mean_terminated_length": 18.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 28.5, "frac_reward_zero_std": 0.0, "grad_norm": 7.119670391082764, "kl": 0.16098973900079727, "learning_rate": 4.701111111111111e-06, "loss": 0.0089, "num_tokens": 472766.0, "reward": 5.75, "reward_std": 2.598076105117798, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 2.598076105117798, "step": 1539 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 28.51851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.025635335594415665, "kl": 0.0006484165787696838, "learning_rate": 4.700555555555556e-06, "loss": 0.0, "num_tokens": 472978.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 1540 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 28.537037037037038, "frac_reward_zero_std": 1.0, "grad_norm": 0.04605203494429588, "kl": 0.0025195926427841187, "learning_rate": 4.7e-06, "loss": 0.0001, "num_tokens": 473190.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 1541 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 28.555555555555557, "frac_reward_zero_std": 1.0, "grad_norm": 0.016753720119595528, "kl": 0.012664477340877056, "learning_rate": 4.699444444444445e-06, "loss": 0.0006, "num_tokens": 473462.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1542 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.25, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 18.25, "completions/mean_terminated_length": 18.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 28.574074074074073, "frac_reward_zero_std": 1.0, "grad_norm": 0.6887959241867065, "kl": 0.12179721519351006, "learning_rate": 4.6988888888888895e-06, "loss": 0.0062, "num_tokens": 473755.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1543 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 31.25, "completions/clipped_ratio": 0.0, "completions/max_length": 74.0, "completions/max_terminated_length": 74.0, "completions/mean_length": 31.25, "completions/mean_terminated_length": 31.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 28.59259259259259, "frac_reward_zero_std": 0.0, "grad_norm": 1.7486634254455566, "kl": 0.051871391013264656, "learning_rate": 4.698333333333334e-06, "loss": 0.3174, "num_tokens": 474100.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 1544 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.5, "completions/clipped_ratio": 0.0, "completions/max_length": 56.0, "completions/max_terminated_length": 56.0, "completions/mean_length": 24.5, "completions/mean_terminated_length": 24.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 28.61111111111111, "frac_reward_zero_std": 0.0, "grad_norm": 4.9035749435424805, "kl": 0.03490694286301732, "learning_rate": 4.697777777777778e-06, "loss": 0.3025, "num_tokens": 474422.0, "reward": 6.125, "reward_std": 3.75, "rewards/reward_combined/mean": 6.125, "rewards/reward_combined/std": 3.75, "step": 1545 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 28.62962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.00867241807281971, "kl": 0.009834587574005127, "learning_rate": 4.6972222222222225e-06, "loss": 0.0005, "num_tokens": 474658.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 1546 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.25, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 13.25, "completions/mean_terminated_length": 13.25, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 28.64814814814815, "frac_reward_zero_std": 0.0, "grad_norm": 7.576884746551514, "kl": 0.10422676149755716, "learning_rate": 4.696666666666667e-06, "loss": 0.2644, "num_tokens": 474935.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 1547 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.5, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 16.5, "completions/mean_terminated_length": 16.5, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 28.666666666666668, "frac_reward_zero_std": 1.0, "grad_norm": 0.2191072404384613, "kl": 0.020568528212606907, "learning_rate": 4.696111111111111e-06, "loss": 0.0009, "num_tokens": 475241.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1548 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 28.685185185185187, "frac_reward_zero_std": 1.0, "grad_norm": 0.0192855391651392, "kl": 0.0005508005560841411, "learning_rate": 4.695555555555556e-06, "loss": 0.0, "num_tokens": 475497.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 1549 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.0, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 21.0, "completions/mean_terminated_length": 21.0, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 28.703703703703702, "frac_reward_zero_std": 0.0, "grad_norm": 2.227599620819092, "kl": 0.0939160380512476, "learning_rate": 4.695e-06, "loss": 0.0052, "num_tokens": 475829.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 1550 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 28.72222222222222, "frac_reward_zero_std": 1.0, "grad_norm": 0.18136021494865417, "kl": 0.05204852297902107, "learning_rate": 4.694444444444445e-06, "loss": 0.0026, "num_tokens": 476097.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1551 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 9.5, "completions/mean_terminated_length": 9.5, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 28.74074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.263349324464798, "kl": 0.03143658605404198, "learning_rate": 4.6938888888888895e-06, "loss": 0.0018, "num_tokens": 476355.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1552 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.25, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.25, "completions/mean_terminated_length": 14.25, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 28.75925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.21915017068386078, "kl": 0.045839522033929825, "learning_rate": 4.693333333333334e-06, "loss": 0.0023, "num_tokens": 476644.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1553 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.5, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 16.5, "completions/mean_terminated_length": 16.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 28.77777777777778, "frac_reward_zero_std": 0.0, "grad_norm": 4.3386311531066895, "kl": 0.019332028459757566, "learning_rate": 4.692777777777778e-06, "loss": 0.1221, "num_tokens": 476934.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 1554 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.5, "completions/clipped_ratio": 0.0, "completions/max_length": 51.0, "completions/max_terminated_length": 51.0, "completions/mean_length": 23.5, "completions/mean_terminated_length": 23.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 28.796296296296298, "frac_reward_zero_std": 1.0, "grad_norm": 0.17969204485416412, "kl": 0.039106784388422966, "learning_rate": 4.6922222222222226e-06, "loss": 0.0019, "num_tokens": 477248.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1555 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.0, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 22.0, "completions/mean_terminated_length": 22.0, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 28.814814814814813, "frac_reward_zero_std": 1.0, "grad_norm": 0.05091001093387604, "kl": 0.01778921950608492, "learning_rate": 4.691666666666667e-06, "loss": 0.0009, "num_tokens": 477564.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1556 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 28.833333333333332, "frac_reward_zero_std": 1.0, "grad_norm": 0.05992990732192993, "kl": 0.00798164214938879, "learning_rate": 4.691111111111111e-06, "loss": 0.0004, "num_tokens": 477854.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1557 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.75, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.75, "completions/mean_terminated_length": 6.75, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 28.85185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.022657297551631927, "kl": 0.002747099322732538, "learning_rate": 4.690555555555556e-06, "loss": 0.0001, "num_tokens": 478089.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 1558 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 28.87037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.17392367124557495, "kl": 0.1001681461930275, "learning_rate": 4.69e-06, "loss": 0.0047, "num_tokens": 478365.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1559 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.0, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 24.0, "completions/mean_terminated_length": 24.0, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 28.88888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.0046796640381217, "kl": 0.0897149033844471, "learning_rate": 4.689444444444445e-06, "loss": 0.0045, "num_tokens": 478729.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 1560 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.25, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 9.25, "completions/mean_terminated_length": 9.25, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 28.90740740740741, "frac_reward_zero_std": 1.0, "grad_norm": 0.035448797047138214, "kl": 0.011463688686490059, "learning_rate": 4.6888888888888895e-06, "loss": 0.0007, "num_tokens": 478978.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 1561 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.75, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 12.75, "completions/mean_terminated_length": 12.75, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 28.925925925925927, "frac_reward_zero_std": 1.0, "grad_norm": 0.050172194838523865, "kl": 0.009179504588246346, "learning_rate": 4.688333333333334e-06, "loss": 0.0004, "num_tokens": 479241.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1562 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 28.944444444444443, "frac_reward_zero_std": 1.0, "grad_norm": 0.018002422526478767, "kl": 0.002928142435848713, "learning_rate": 4.687777777777778e-06, "loss": 0.0001, "num_tokens": 479525.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1563 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 28.962962962962962, "frac_reward_zero_std": 1.0, "grad_norm": 0.045376140624284744, "kl": 0.016566124744713306, "learning_rate": 4.687222222222223e-06, "loss": 0.0008, "num_tokens": 479837.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 1564 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 27.0, "completions/clipped_ratio": 0.0, "completions/max_length": 31.0, "completions/max_terminated_length": 31.0, "completions/mean_length": 27.0, "completions/mean_terminated_length": 27.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 28.98148148148148, "frac_reward_zero_std": 0.0, "grad_norm": 8.414496421813965, "kl": 0.04490022920072079, "learning_rate": 4.686666666666667e-06, "loss": 0.1296, "num_tokens": 480169.0, "reward": 4.875, "reward_std": 3.5443618297576904, "rewards/reward_combined/mean": 4.875, "rewards/reward_combined/std": 3.5443618297576904, "step": 1565 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 43.0, "completions/clipped_ratio": 0.0, "completions/max_length": 43.0, "completions/max_terminated_length": 43.0, "completions/mean_length": 43.0, "completions/mean_terminated_length": 43.0, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 29.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.02320653572678566, "kl": 0.058135055005550385, "learning_rate": 4.686111111111111e-06, "loss": 0.0029, "num_tokens": 480621.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 1566 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 7.5, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 7.5, "completions/mean_terminated_length": 7.5, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 29.01851851851852, "frac_reward_zero_std": 0.0, "grad_norm": 7.328979015350342, "kl": 0.03812351077795029, "learning_rate": 4.685555555555556e-06, "loss": 0.0645, "num_tokens": 480859.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 1567 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.5, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 19.5, "completions/mean_terminated_length": 19.5, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 29.037037037037038, "frac_reward_zero_std": 1.0, "grad_norm": 0.026389921084046364, "kl": 0.02707106526941061, "learning_rate": 4.685000000000001e-06, "loss": 0.0014, "num_tokens": 481185.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 1568 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.25, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 14.25, "completions/mean_terminated_length": 14.25, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 29.055555555555557, "frac_reward_zero_std": 0.0, "grad_norm": 5.460058689117432, "kl": 0.011236458085477352, "learning_rate": 4.684444444444444e-06, "loss": 0.1397, "num_tokens": 481454.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 1569 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 33.25, "completions/clipped_ratio": 0.0, "completions/max_length": 40.0, "completions/max_terminated_length": 40.0, "completions/mean_length": 33.25, "completions/mean_terminated_length": 33.25, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 29.074074074074073, "frac_reward_zero_std": 1.0, "grad_norm": 0.0792195051908493, "kl": 0.02236015535891056, "learning_rate": 4.68388888888889e-06, "loss": 0.0011, "num_tokens": 481811.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1570 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.0, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 22.0, "completions/mean_terminated_length": 22.0, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 29.09259259259259, "frac_reward_zero_std": 1.0, "grad_norm": 0.04217391833662987, "kl": 0.017141496762633324, "learning_rate": 4.683333333333334e-06, "loss": 0.0009, "num_tokens": 482127.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1571 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 29.11111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.017659710720181465, "kl": 0.21858195960521698, "learning_rate": 4.682777777777778e-06, "loss": 0.0109, "num_tokens": 482431.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1572 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 43.0, "completions/clipped_ratio": 0.0, "completions/max_length": 43.0, "completions/max_terminated_length": 43.0, "completions/mean_length": 43.0, "completions/mean_terminated_length": 43.0, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 29.12962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.05026821047067642, "kl": 0.061335181817412376, "learning_rate": 4.682222222222223e-06, "loss": 0.0031, "num_tokens": 482883.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 1573 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 29.14814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.029550723731517792, "kl": 0.005892897606827319, "learning_rate": 4.681666666666667e-06, "loss": 0.0003, "num_tokens": 483183.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1574 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 29.166666666666668, "frac_reward_zero_std": 1.0, "grad_norm": 0.005204958375543356, "kl": 0.00012739549492835067, "learning_rate": 4.681111111111111e-06, "loss": 0.0, "num_tokens": 483453.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1575 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.25, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 19.25, "completions/mean_terminated_length": 19.25, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 29.185185185185187, "frac_reward_zero_std": 1.0, "grad_norm": 0.04024605453014374, "kl": 0.006967444438487291, "learning_rate": 4.680555555555556e-06, "loss": 0.0003, "num_tokens": 483758.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1576 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 29.203703703703702, "frac_reward_zero_std": 1.0, "grad_norm": 0.005568469408899546, "kl": 0.0005135759711265564, "learning_rate": 4.680000000000001e-06, "loss": 0.0, "num_tokens": 483970.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 1577 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 29.22222222222222, "frac_reward_zero_std": 1.0, "grad_norm": 0.26731348037719727, "kl": 0.039244771003723145, "learning_rate": 4.6794444444444444e-06, "loss": 0.0019, "num_tokens": 484242.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1578 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 29.24074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.046234190464019775, "kl": 0.005977684399113059, "learning_rate": 4.67888888888889e-06, "loss": 0.0003, "num_tokens": 484502.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 1579 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 29.25925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.007959249429404736, "kl": 0.0024945277255028486, "learning_rate": 4.678333333333334e-06, "loss": 0.0001, "num_tokens": 484786.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1580 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.75, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 17.75, "completions/mean_terminated_length": 17.75, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 29.27777777777778, "frac_reward_zero_std": 1.0, "grad_norm": 0.7501109838485718, "kl": 0.1324031502008438, "learning_rate": 4.677777777777778e-06, "loss": 0.0065, "num_tokens": 485077.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 1581 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 29.296296296296298, "frac_reward_zero_std": 1.0, "grad_norm": 0.012347083538770676, "kl": 0.001081898808479309, "learning_rate": 4.677222222222223e-06, "loss": 0.0001, "num_tokens": 485321.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 1582 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.25, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.25, "completions/mean_terminated_length": 14.25, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 29.314814814814813, "frac_reward_zero_std": 1.0, "grad_norm": 0.10512775182723999, "kl": 0.03028771048411727, "learning_rate": 4.676666666666667e-06, "loss": 0.0015, "num_tokens": 485610.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1583 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.25, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 18.25, "completions/mean_terminated_length": 18.25, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 29.333333333333332, "frac_reward_zero_std": 1.0, "grad_norm": 0.07810891419649124, "kl": 0.00897173467092216, "learning_rate": 4.676111111111111e-06, "loss": 0.0005, "num_tokens": 485943.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1584 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.0, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 19.0, "completions/mean_terminated_length": 19.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 29.35185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.22176046669483185, "kl": 0.1552755981683731, "learning_rate": 4.675555555555556e-06, "loss": 0.0076, "num_tokens": 486259.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1585 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.0, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 19.0, "completions/mean_terminated_length": 19.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 29.37037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.10077960789203644, "kl": 0.09424827806651592, "learning_rate": 4.675000000000001e-06, "loss": 0.0047, "num_tokens": 486555.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1586 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 30.0, "completions/clipped_ratio": 0.0, "completions/max_length": 32.0, "completions/max_terminated_length": 32.0, "completions/mean_length": 30.0, "completions/mean_terminated_length": 30.0, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 29.38888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.17300434410572052, "kl": 0.1194087453186512, "learning_rate": 4.6744444444444445e-06, "loss": 0.006, "num_tokens": 486911.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 1587 }, { "clip_ratio/high_max": 0.010869565419852734, "clip_ratio/high_mean": 0.010869565419852734, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010869565419852734, "completion_length": 24.0, "completions/clipped_ratio": 0.0, "completions/max_length": 29.0, "completions/max_terminated_length": 29.0, "completions/mean_length": 24.0, "completions/mean_terminated_length": 24.0, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 29.40740740740741, "frac_reward_zero_std": 0.0, "grad_norm": 6.570488452911377, "kl": 0.05348486453294754, "learning_rate": 4.67388888888889e-06, "loss": 0.0784, "num_tokens": 487231.0, "reward": 6.625, "reward_std": 2.0966243743896484, "rewards/reward_combined/mean": 6.625, "rewards/reward_combined/std": 2.0966243743896484, "step": 1588 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.25, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.25, "completions/mean_terminated_length": 14.25, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 29.425925925925927, "frac_reward_zero_std": 0.0, "grad_norm": 5.150686740875244, "kl": 0.02815502928569913, "learning_rate": 4.673333333333333e-06, "loss": -0.0185, "num_tokens": 487520.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 1589 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 26.0, "completions/clipped_ratio": 0.0, "completions/max_length": 29.0, "completions/max_terminated_length": 29.0, "completions/mean_length": 26.0, "completions/mean_terminated_length": 26.0, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 29.444444444444443, "frac_reward_zero_std": 0.0, "grad_norm": 3.679452896118164, "kl": 0.17078236490488052, "learning_rate": 4.672777777777778e-06, "loss": 0.0672, "num_tokens": 487852.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 1590 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 29.462962962962962, "frac_reward_zero_std": 1.0, "grad_norm": 0.1983886957168579, "kl": 0.018841517332475632, "learning_rate": 4.672222222222223e-06, "loss": 0.0009, "num_tokens": 488108.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 1591 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 29.48148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.03543396666646004, "kl": 0.014736307319253683, "learning_rate": 4.671666666666667e-06, "loss": 0.0007, "num_tokens": 488380.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1592 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 9.5, "completions/mean_terminated_length": 9.5, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 29.5, "frac_reward_zero_std": 1.0, "grad_norm": 0.15950022637844086, "kl": 0.013705291785299778, "learning_rate": 4.6711111111111115e-06, "loss": 0.0007, "num_tokens": 488638.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1593 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 29.51851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.014986110851168633, "kl": 0.03225737810134888, "learning_rate": 4.670555555555556e-06, "loss": 0.0016, "num_tokens": 488854.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 1594 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.5, "completions/clipped_ratio": 0.0, "completions/max_length": 25.0, "completions/max_terminated_length": 25.0, "completions/mean_length": 23.5, "completions/mean_terminated_length": 23.5, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 29.537037037037038, "frac_reward_zero_std": 1.0, "grad_norm": 0.18729066848754883, "kl": 0.03868412598967552, "learning_rate": 4.670000000000001e-06, "loss": 0.0019, "num_tokens": 489168.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1595 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 29.555555555555557, "frac_reward_zero_std": 1.0, "grad_norm": 0.0007123021641746163, "kl": 2.9481947422027588e-05, "learning_rate": 4.6694444444444445e-06, "loss": 0.0, "num_tokens": 489388.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 1596 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.0, "completions/clipped_ratio": 0.0, "completions/max_length": 36.0, "completions/max_terminated_length": 36.0, "completions/mean_length": 23.0, "completions/mean_terminated_length": 23.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 29.574074074074073, "frac_reward_zero_std": 0.0, "grad_norm": 3.3491134643554688, "kl": 0.042570777237415314, "learning_rate": 4.66888888888889e-06, "loss": -0.0036, "num_tokens": 489732.0, "reward": 5.875, "reward_std": 3.5910768508911133, "rewards/reward_combined/mean": 5.875, "rewards/reward_combined/std": 3.5910770893096924, "step": 1597 }, { "clip_ratio/high_max": 0.011111111380159855, "clip_ratio/high_mean": 0.011111111380159855, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.011111111380159855, "completion_length": 20.75, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 20.75, "completions/mean_terminated_length": 20.75, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 29.59259259259259, "frac_reward_zero_std": 0.0, "grad_norm": 4.084628105163574, "kl": 0.04290131293237209, "learning_rate": 4.668333333333333e-06, "loss": -0.0784, "num_tokens": 490059.0, "reward": 6.75, "reward_std": 2.5, "rewards/reward_combined/mean": 6.75, "rewards/reward_combined/std": 2.5, "step": 1598 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 29.61111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.000804347509983927, "kl": 0.018587633036077023, "learning_rate": 4.6677777777777785e-06, "loss": 0.0009, "num_tokens": 490319.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1599 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 5.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 5.0, "completions/mean_terminated_length": 5.0, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 29.62962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.0034211771562695503, "kl": 0.00315287709236145, "learning_rate": 4.667222222222223e-06, "loss": 0.0002, "num_tokens": 490539.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 1600 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 7.75, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 7.75, "completions/mean_terminated_length": 7.75, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 29.64814814814815, "frac_reward_zero_std": 0.0, "grad_norm": 6.980684757232666, "kl": 0.2403966225683689, "learning_rate": 4.666666666666667e-06, "loss": -0.0179, "num_tokens": 490798.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 1601 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.0, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 21.0, "completions/mean_terminated_length": 21.0, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 29.666666666666668, "frac_reward_zero_std": 1.0, "grad_norm": 0.1412925124168396, "kl": 0.0468850564211607, "learning_rate": 4.6661111111111115e-06, "loss": 0.0023, "num_tokens": 491130.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1602 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 29.685185185185187, "frac_reward_zero_std": 1.0, "grad_norm": 0.12599427998065948, "kl": 0.09101472049951553, "learning_rate": 4.665555555555556e-06, "loss": 0.0046, "num_tokens": 491402.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1603 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.5, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 16.5, "completions/mean_terminated_length": 16.5, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 29.703703703703702, "frac_reward_zero_std": 1.0, "grad_norm": 0.0775795727968216, "kl": 0.021360242273658514, "learning_rate": 4.665e-06, "loss": 0.001, "num_tokens": 491716.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 1604 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 34.0, "completions/clipped_ratio": 0.0, "completions/max_length": 40.0, "completions/max_terminated_length": 40.0, "completions/mean_length": 34.0, "completions/mean_terminated_length": 34.0, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 29.72222222222222, "frac_reward_zero_std": 0.0, "grad_norm": 3.3650407791137695, "kl": 0.04723835736513138, "learning_rate": 4.664444444444445e-06, "loss": 0.0228, "num_tokens": 492068.0, "reward": 2.875, "reward_std": 1.25, "rewards/reward_combined/mean": 2.875, "rewards/reward_combined/std": 1.25, "step": 1605 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 29.74074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.01850694604218006, "kl": 0.01235404284670949, "learning_rate": 4.66388888888889e-06, "loss": 0.0006, "num_tokens": 492340.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1606 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 29.75925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.22231703996658325, "kl": 0.03603258868679404, "learning_rate": 4.663333333333333e-06, "loss": 0.0017, "num_tokens": 492596.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1607 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 29.77777777777778, "frac_reward_zero_std": 1.0, "grad_norm": 0.025749903172254562, "kl": 0.016232634894549847, "learning_rate": 4.6627777777777785e-06, "loss": 0.0008, "num_tokens": 492888.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1608 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 10.5, "completions/mean_terminated_length": 10.5, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 29.796296296296298, "frac_reward_zero_std": 1.0, "grad_norm": 0.0317452996969223, "kl": 0.00452280358877033, "learning_rate": 4.662222222222222e-06, "loss": 0.0002, "num_tokens": 493154.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1609 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.25, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.25, "completions/mean_terminated_length": 13.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 29.814814814814813, "frac_reward_zero_std": 1.0, "grad_norm": 0.03965836018323898, "kl": 0.005493992357514799, "learning_rate": 4.661666666666667e-06, "loss": 0.0003, "num_tokens": 493431.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1610 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.25, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 3.25, "completions/mean_terminated_length": 3.25, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 29.833333333333332, "frac_reward_zero_std": 1.0, "grad_norm": 0.20968350768089294, "kl": 0.013462136499583721, "learning_rate": 4.6611111111111116e-06, "loss": 0.0007, "num_tokens": 493640.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 1611 }, { "clip_ratio/high_max": 0.012500000186264515, "clip_ratio/high_mean": 0.012500000186264515, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.012500000186264515, "completion_length": 22.5, "completions/clipped_ratio": 0.0, "completions/max_length": 30.0, "completions/max_terminated_length": 30.0, "completions/mean_length": 22.5, "completions/mean_terminated_length": 22.5, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 29.85185185185185, "frac_reward_zero_std": 0.0, "grad_norm": 2.608407497406006, "kl": 0.037865517660975456, "learning_rate": 4.660555555555556e-06, "loss": 0.1011, "num_tokens": 493982.0, "reward": 6.125, "reward_std": 3.75, "rewards/reward_combined/mean": 6.125, "rewards/reward_combined/std": 3.75, "step": 1612 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 29.87037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.0723642036318779, "kl": 0.011732339393347502, "learning_rate": 4.66e-06, "loss": 0.0006, "num_tokens": 494264.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1613 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.5, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 18.5, "completions/mean_terminated_length": 18.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 29.88888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.10431905835866928, "kl": 0.13544725626707077, "learning_rate": 4.659444444444445e-06, "loss": 0.0068, "num_tokens": 494566.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1614 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 29.90740740740741, "frac_reward_zero_std": 1.0, "grad_norm": 0.1556599885225296, "kl": 0.04082600772380829, "learning_rate": 4.65888888888889e-06, "loss": 0.002, "num_tokens": 494834.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1615 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.25, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 4.25, "completions/mean_terminated_length": 4.25, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 29.925925925925927, "frac_reward_zero_std": 0.0, "grad_norm": 11.551952362060547, "kl": 0.005519214319065213, "learning_rate": 4.658333333333333e-06, "loss": 0.0598, "num_tokens": 495071.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 1616 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.0, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 24.0, "completions/mean_terminated_length": 24.0, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 29.944444444444443, "frac_reward_zero_std": 1.0, "grad_norm": 0.004422692582011223, "kl": 0.08987029269337654, "learning_rate": 4.6577777777777785e-06, "loss": 0.0045, "num_tokens": 495435.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 1617 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 18.0, "completions/mean_terminated_length": 18.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 29.962962962962962, "frac_reward_zero_std": 1.0, "grad_norm": 0.0050407410599291325, "kl": 0.003551547648385167, "learning_rate": 4.657222222222222e-06, "loss": 0.0002, "num_tokens": 495747.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1618 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 29.98148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.008821951225399971, "kl": 0.0022641271352767944, "learning_rate": 4.656666666666667e-06, "loss": 0.0001, "num_tokens": 495959.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 1619 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 30.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.0038868843112140894, "kl": 0.0034947991371154785, "learning_rate": 4.656111111111112e-06, "loss": 0.0002, "num_tokens": 496255.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1620 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.0, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 20.0, "completions/mean_terminated_length": 20.0, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 30.01851851851852, "frac_reward_zero_std": 0.0, "grad_norm": 1.8526206016540527, "kl": 0.1915391981601715, "learning_rate": 4.655555555555556e-06, "loss": 0.0312, "num_tokens": 496587.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 1621 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.75, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 12.75, "completions/mean_terminated_length": 12.75, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 30.037037037037038, "frac_reward_zero_std": 1.0, "grad_norm": 0.12061259895563126, "kl": 0.009672592859715223, "learning_rate": 4.655e-06, "loss": 0.0005, "num_tokens": 496862.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1622 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.25, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.25, "completions/mean_terminated_length": 11.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 30.055555555555557, "frac_reward_zero_std": 1.0, "grad_norm": 0.25382912158966064, "kl": 0.048417724668979645, "learning_rate": 4.654444444444445e-06, "loss": 0.0025, "num_tokens": 497135.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1623 }, { "clip_ratio/high_max": 0.012500000186264515, "clip_ratio/high_mean": 0.012500000186264515, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.012500000186264515, "completion_length": 27.5, "completions/clipped_ratio": 0.0, "completions/max_length": 51.0, "completions/max_terminated_length": 51.0, "completions/mean_length": 27.5, "completions/mean_terminated_length": 27.5, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 30.074074074074073, "frac_reward_zero_std": 0.0, "grad_norm": 3.936505079269409, "kl": 0.1631486602127552, "learning_rate": 4.653888888888889e-06, "loss": 0.236, "num_tokens": 497465.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 1624 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.75, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 2.75, "completions/mean_terminated_length": 2.75, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 30.09259259259259, "frac_reward_zero_std": 1.0, "grad_norm": 0.367085337638855, "kl": 0.04197363555431366, "learning_rate": 4.653333333333333e-06, "loss": 0.0025, "num_tokens": 497672.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 1625 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.0, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 24.0, "completions/mean_terminated_length": 24.0, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 30.11111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.004340685438364744, "kl": 0.08990556746721268, "learning_rate": 4.652777777777779e-06, "loss": 0.0045, "num_tokens": 498036.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 1626 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 36.5, "completions/clipped_ratio": 0.0, "completions/max_length": 45.0, "completions/max_terminated_length": 45.0, "completions/mean_length": 36.5, "completions/mean_terminated_length": 36.5, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 30.12962962962963, "frac_reward_zero_std": 0.0, "grad_norm": 2.226398229598999, "kl": 0.030453883111476898, "learning_rate": 4.652222222222222e-06, "loss": 0.0933, "num_tokens": 498406.0, "reward": 4.75, "reward_std": 5.5, "rewards/reward_combined/mean": 4.75, "rewards/reward_combined/std": 5.5, "step": 1627 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 30.14814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.008466118946671486, "kl": 0.0025333832018077374, "learning_rate": 4.651666666666667e-06, "loss": 0.0001, "num_tokens": 498690.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1628 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 30.166666666666668, "frac_reward_zero_std": 1.0, "grad_norm": 0.0006495642592199147, "kl": 2.5294721126556396e-05, "learning_rate": 4.651111111111112e-06, "loss": 0.0, "num_tokens": 498910.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 1629 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 30.185185185185187, "frac_reward_zero_std": 1.0, "grad_norm": 0.036879945546388626, "kl": 0.010825079400092363, "learning_rate": 4.650555555555556e-06, "loss": 0.0005, "num_tokens": 499198.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1630 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.75, "completions/clipped_ratio": 0.0, "completions/max_length": 52.0, "completions/max_terminated_length": 52.0, "completions/mean_length": 22.75, "completions/mean_terminated_length": 22.75, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 30.203703703703702, "frac_reward_zero_std": 0.0, "grad_norm": 7.663207530975342, "kl": 0.02379700861638412, "learning_rate": 4.65e-06, "loss": 0.3372, "num_tokens": 499513.0, "reward": 6.125, "reward_std": 3.75, "rewards/reward_combined/mean": 6.125, "rewards/reward_combined/std": 3.75, "step": 1631 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 30.22222222222222, "frac_reward_zero_std": 1.0, "grad_norm": 0.04137452691793442, "kl": 0.02038796991109848, "learning_rate": 4.649444444444445e-06, "loss": 0.001, "num_tokens": 499809.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1632 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 30.24074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.0014994542580097914, "kl": 0.00010207742525381036, "learning_rate": 4.648888888888889e-06, "loss": 0.0, "num_tokens": 500079.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1633 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 30.25925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.004904774017632008, "kl": 0.0002994835376739502, "learning_rate": 4.6483333333333334e-06, "loss": 0.0, "num_tokens": 500291.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 1634 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 30.27777777777778, "frac_reward_zero_std": 1.0, "grad_norm": 0.21106386184692383, "kl": 0.02048715204000473, "learning_rate": 4.647777777777779e-06, "loss": 0.001, "num_tokens": 500551.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 1635 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 5.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 5.0, "completions/mean_terminated_length": 5.0, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 30.296296296296298, "frac_reward_zero_std": 1.0, "grad_norm": 0.002877729944884777, "kl": 0.0033727765548974276, "learning_rate": 4.647222222222222e-06, "loss": 0.0002, "num_tokens": 500771.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 1636 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 30.314814814814813, "frac_reward_zero_std": 1.0, "grad_norm": 0.03615758940577507, "kl": 0.004237005021423101, "learning_rate": 4.646666666666667e-06, "loss": 0.0002, "num_tokens": 501039.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1637 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 30.333333333333332, "frac_reward_zero_std": 0.0, "grad_norm": 4.63889741897583, "kl": 0.017513833940029144, "learning_rate": 4.646111111111111e-06, "loss": -0.0915, "num_tokens": 501317.0, "reward": 7.625, "reward_std": 0.25, "rewards/reward_combined/mean": 7.625, "rewards/reward_combined/std": 0.25, "step": 1638 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.013513513840734959, "clip_ratio/low_min": 0.013513513840734959, "clip_ratio/region_mean": 0.013513513840734959, "completion_length": 22.25, "completions/clipped_ratio": 0.0, "completions/max_length": 34.0, "completions/max_terminated_length": 34.0, "completions/mean_length": 22.25, "completions/mean_terminated_length": 22.25, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 30.35185185185185, "frac_reward_zero_std": 0.0, "grad_norm": 6.964344024658203, "kl": 0.08402857184410095, "learning_rate": 4.645555555555556e-06, "loss": -0.1437, "num_tokens": 501630.0, "reward": 7.625, "reward_std": 0.25, "rewards/reward_combined/mean": 7.625, "rewards/reward_combined/std": 0.25, "step": 1639 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 7.75, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 7.75, "completions/mean_terminated_length": 7.75, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 30.37037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.0694352462887764, "kl": 0.006220644805580378, "learning_rate": 4.645e-06, "loss": 0.0003, "num_tokens": 501873.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 1640 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.0, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 17.0, "completions/mean_terminated_length": 17.0, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 30.38888888888889, "frac_reward_zero_std": 0.0, "grad_norm": 5.498944282531738, "kl": 0.13281813263893127, "learning_rate": 4.644444444444445e-06, "loss": -0.0553, "num_tokens": 502181.0, "reward": 5.75, "reward_std": 4.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 4.5, "step": 1641 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 30.40740740740741, "frac_reward_zero_std": 1.0, "grad_norm": 0.002554322360083461, "kl": 0.011056587100028992, "learning_rate": 4.643888888888889e-06, "loss": 0.0006, "num_tokens": 502417.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 1642 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.75, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 12.75, "completions/mean_terminated_length": 12.75, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 30.425925925925927, "frac_reward_zero_std": 1.0, "grad_norm": 0.22775501012802124, "kl": 0.02651564870029688, "learning_rate": 4.6433333333333335e-06, "loss": 0.0014, "num_tokens": 502680.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1643 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 30.444444444444443, "frac_reward_zero_std": 1.0, "grad_norm": 0.059793487191200256, "kl": 0.0015301525709219277, "learning_rate": 4.642777777777779e-06, "loss": 0.0001, "num_tokens": 502936.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 1644 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 26.0, "completions/clipped_ratio": 0.0, "completions/max_length": 30.0, "completions/max_terminated_length": 30.0, "completions/mean_length": 26.0, "completions/mean_terminated_length": 26.0, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 30.462962962962962, "frac_reward_zero_std": 1.0, "grad_norm": 0.1518106311559677, "kl": 0.028103399090468884, "learning_rate": 4.642222222222222e-06, "loss": 0.0014, "num_tokens": 503268.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1645 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 30.48148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.04031050205230713, "kl": 0.049436675384640694, "learning_rate": 4.641666666666667e-06, "loss": 0.0025, "num_tokens": 503562.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1646 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 30.5, "frac_reward_zero_std": 1.0, "grad_norm": 0.07616262137889862, "kl": 0.03286079317331314, "learning_rate": 4.641111111111111e-06, "loss": 0.0016, "num_tokens": 503830.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1647 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 30.51851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.009053540416061878, "kl": 0.002137616276741028, "learning_rate": 4.640555555555556e-06, "loss": 0.0001, "num_tokens": 504042.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 1648 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 7.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 7.0, "completions/mean_terminated_length": 7.0, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 30.537037037037038, "frac_reward_zero_std": 1.0, "grad_norm": 0.012759193778038025, "kl": 0.01270288648083806, "learning_rate": 4.6400000000000005e-06, "loss": 0.0006, "num_tokens": 504278.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 1649 }, { "clip_ratio/high_max": 0.010204081423580647, "clip_ratio/high_mean": 0.010204081423580647, "clip_ratio/low_mean": 0.01315789483487606, "clip_ratio/low_min": 0.01315789483487606, "clip_ratio/region_mean": 0.023361976258456707, "completion_length": 21.75, "completions/clipped_ratio": 0.0, "completions/max_length": 25.0, "completions/max_terminated_length": 25.0, "completions/mean_length": 21.75, "completions/mean_terminated_length": 21.75, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 30.555555555555557, "frac_reward_zero_std": 0.0, "grad_norm": 3.7974088191986084, "kl": 0.09364419430494308, "learning_rate": 4.639444444444445e-06, "loss": 0.0328, "num_tokens": 504585.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 1650 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.5, "completions/clipped_ratio": 0.0, "completions/max_length": 29.0, "completions/max_terminated_length": 29.0, "completions/mean_length": 24.5, "completions/mean_terminated_length": 24.5, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 30.574074074074073, "frac_reward_zero_std": 1.0, "grad_norm": 0.15516987442970276, "kl": 0.13953964412212372, "learning_rate": 4.638888888888889e-06, "loss": 0.007, "num_tokens": 504911.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1651 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.5, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 20.5, "completions/mean_terminated_length": 20.5, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 30.59259259259259, "frac_reward_zero_std": 1.0, "grad_norm": 0.2140371948480606, "kl": 0.03296723170205951, "learning_rate": 4.6383333333333335e-06, "loss": 0.0016, "num_tokens": 505233.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1652 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.75, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 22.75, "completions/mean_terminated_length": 22.75, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 30.61111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.21111167967319489, "kl": 0.018634764943271875, "learning_rate": 4.637777777777778e-06, "loss": 0.0009, "num_tokens": 505568.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1653 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 29.0, "completions/clipped_ratio": 0.0, "completions/max_length": 32.0, "completions/max_terminated_length": 32.0, "completions/mean_length": 29.0, "completions/mean_terminated_length": 29.0, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 30.62962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.07057809829711914, "kl": 0.10635381564497948, "learning_rate": 4.637222222222222e-06, "loss": 0.0053, "num_tokens": 505920.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 1654 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.0, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 20.0, "completions/mean_terminated_length": 20.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 30.64814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.41775527596473694, "kl": 0.0926544014364481, "learning_rate": 4.6366666666666674e-06, "loss": 0.0046, "num_tokens": 506220.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1655 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 7.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 7.0, "completions/mean_terminated_length": 7.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 30.666666666666668, "frac_reward_zero_std": 1.0, "grad_norm": 0.18610379099845886, "kl": 0.0140237957239151, "learning_rate": 4.636111111111111e-06, "loss": 0.0007, "num_tokens": 506468.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 1656 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 30.685185185185187, "frac_reward_zero_std": 1.0, "grad_norm": 0.01999879814684391, "kl": 0.0036700874334201217, "learning_rate": 4.635555555555556e-06, "loss": 0.0002, "num_tokens": 506768.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1657 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 30.703703703703702, "frac_reward_zero_std": 1.0, "grad_norm": 2.9753313064575195, "kl": 0.25645938562229276, "learning_rate": 4.6350000000000005e-06, "loss": 0.0118, "num_tokens": 507068.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1658 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 30.72222222222222, "frac_reward_zero_std": 1.0, "grad_norm": 0.04412281513214111, "kl": 0.003612235188484192, "learning_rate": 4.634444444444445e-06, "loss": 0.0002, "num_tokens": 507324.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1659 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 30.74074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.38990476727485657, "kl": 0.06865782849490643, "learning_rate": 4.633888888888889e-06, "loss": 0.0035, "num_tokens": 507592.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1660 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 30.75925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.0016786116175353527, "kl": 0.0184159018099308, "learning_rate": 4.633333333333334e-06, "loss": 0.0009, "num_tokens": 507852.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1661 }, { "clip_ratio/high_max": 0.006849315017461777, "clip_ratio/high_mean": 0.006849315017461777, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.006849315017461777, "completion_length": 35.0, "completions/clipped_ratio": 0.0, "completions/max_length": 53.0, "completions/max_terminated_length": 53.0, "completions/mean_length": 35.0, "completions/mean_terminated_length": 35.0, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 30.77777777777778, "frac_reward_zero_std": 0.0, "grad_norm": 4.098377704620361, "kl": 0.10727032646536827, "learning_rate": 4.632777777777778e-06, "loss": -0.1229, "num_tokens": 508208.0, "reward": 2.875, "reward_std": 1.25, "rewards/reward_combined/mean": 2.875, "rewards/reward_combined/std": 1.25, "step": 1662 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 30.796296296296298, "frac_reward_zero_std": 0.0, "grad_norm": 4.988539695739746, "kl": 0.013883250998333097, "learning_rate": 4.632222222222222e-06, "loss": -0.0322, "num_tokens": 508490.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 1663 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.75, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.75, "completions/mean_terminated_length": 15.75, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 30.814814814814813, "frac_reward_zero_std": 1.0, "grad_norm": 0.05296746641397476, "kl": 0.003398641012609005, "learning_rate": 4.6316666666666675e-06, "loss": 0.0002, "num_tokens": 508813.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1664 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 30.833333333333332, "frac_reward_zero_std": 1.0, "grad_norm": 0.02060055360198021, "kl": 0.21815308928489685, "learning_rate": 4.631111111111111e-06, "loss": 0.0109, "num_tokens": 509117.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1665 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.0, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 21.0, "completions/mean_terminated_length": 21.0, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 30.85185185185185, "frac_reward_zero_std": 0.0, "grad_norm": 2.258530378341675, "kl": 0.03443282097578049, "learning_rate": 4.630555555555556e-06, "loss": 0.0269, "num_tokens": 509453.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 1666 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.5, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 20.5, "completions/mean_terminated_length": 20.5, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 30.87037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.11759570986032486, "kl": 0.037995487451553345, "learning_rate": 4.6300000000000006e-06, "loss": 0.0019, "num_tokens": 509783.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1667 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 30.88888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.008659024722874165, "kl": 0.005731015698984265, "learning_rate": 4.629444444444445e-06, "loss": 0.0003, "num_tokens": 510075.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1668 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.0, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 19.0, "completions/mean_terminated_length": 19.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 30.90740740740741, "frac_reward_zero_std": 0.0, "grad_norm": 3.3894710540771484, "kl": 0.04282430745661259, "learning_rate": 4.628888888888889e-06, "loss": 0.0043, "num_tokens": 510399.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 1669 }, { "clip_ratio/high_max": 0.012820512987673283, "clip_ratio/high_mean": 0.012820512987673283, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.012820512987673283, "completion_length": 19.0, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 19.0, "completions/mean_terminated_length": 19.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 30.925925925925927, "frac_reward_zero_std": 0.0, "grad_norm": 5.079377174377441, "kl": 0.2345985472202301, "learning_rate": 4.628333333333334e-06, "loss": -0.0033, "num_tokens": 510703.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 1670 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 45.75, "completions/clipped_ratio": 0.0, "completions/max_length": 53.0, "completions/max_terminated_length": 53.0, "completions/mean_length": 45.75, "completions/mean_terminated_length": 45.75, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 30.944444444444443, "frac_reward_zero_std": 1.0, "grad_norm": 0.027698596939444542, "kl": 0.0543022695928812, "learning_rate": 4.627777777777778e-06, "loss": 0.0027, "num_tokens": 511166.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 1671 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 76.0, "completions/clipped_ratio": 0.25, "completions/max_length": 256.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 76.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 30.962962962962962, "frac_reward_zero_std": 0.0, "grad_norm": 1.4590680599212646, "kl": 0.008416782598942518, "learning_rate": 4.627222222222222e-06, "loss": 0.442, "num_tokens": 511718.0, "reward": 2.549999952316284, "reward_std": 1.899999976158142, "rewards/reward_combined/mean": 2.549999952316284, "rewards/reward_combined/std": 1.899999976158142, "step": 1672 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 30.98148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.015182198025286198, "kl": 0.030726537108421326, "learning_rate": 4.626666666666667e-06, "loss": 0.0015, "num_tokens": 511934.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 1673 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 58.5, "completions/clipped_ratio": 0.0, "completions/max_length": 179.0, "completions/max_terminated_length": 179.0, "completions/mean_length": 58.5, "completions/mean_terminated_length": 58.5, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 31.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.11166021972894669, "kl": 0.02912729699164629, "learning_rate": 4.626111111111111e-06, "loss": 0.0008, "num_tokens": 512388.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1674 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 90.75, "completions/clipped_ratio": 0.25, "completions/max_length": 256.0, "completions/max_terminated_length": 40.0, "completions/mean_length": 90.75, "completions/mean_terminated_length": 35.66666793823242, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 31.01851851851852, "frac_reward_zero_std": 0.0, "grad_norm": 1.5921870470046997, "kl": 0.09666461311280727, "learning_rate": 4.625555555555556e-06, "loss": 0.1995, "num_tokens": 512975.0, "reward": 3.299999952316284, "reward_std": 3.2031235694885254, "rewards/reward_combined/mean": 3.299999952316284, "rewards/reward_combined/std": 3.2031233310699463, "step": 1675 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.0, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 24.0, "completions/mean_terminated_length": 24.0, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 31.037037037037038, "frac_reward_zero_std": 1.0, "grad_norm": 0.0044399588368833065, "kl": 0.08989174664020538, "learning_rate": 4.625000000000001e-06, "loss": 0.0045, "num_tokens": 513339.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 1676 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 31.055555555555557, "frac_reward_zero_std": 1.0, "grad_norm": 0.3858907222747803, "kl": 0.28700022399425507, "learning_rate": 4.624444444444445e-06, "loss": 0.0144, "num_tokens": 513643.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1677 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.5, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 19.5, "completions/mean_terminated_length": 19.5, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 31.074074074074073, "frac_reward_zero_std": 0.0, "grad_norm": 2.825772285461426, "kl": 0.1042183879762888, "learning_rate": 4.623888888888889e-06, "loss": -0.0197, "num_tokens": 513973.0, "reward": 4.5, "reward_std": 4.041451930999756, "rewards/reward_combined/mean": 4.5, "rewards/reward_combined/std": 4.041451930999756, "step": 1678 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 31.09259259259259, "frac_reward_zero_std": 1.0, "grad_norm": 0.025183819234371185, "kl": 0.006186964106746018, "learning_rate": 4.623333333333334e-06, "loss": 0.0003, "num_tokens": 514261.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1679 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 31.11111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.009446508251130581, "kl": 0.0014360398054122925, "learning_rate": 4.622777777777778e-06, "loss": 0.0001, "num_tokens": 514473.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 1680 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.75, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 18.75, "completions/mean_terminated_length": 18.75, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 31.12962962962963, "frac_reward_zero_std": 0.0, "grad_norm": 3.258073329925537, "kl": 0.04892086237668991, "learning_rate": 4.622222222222222e-06, "loss": 0.0428, "num_tokens": 514796.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 1681 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 27.0, "completions/max_terminated_length": 27.0, "completions/mean_length": 18.0, "completions/mean_terminated_length": 18.0, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 31.14814814814815, "frac_reward_zero_std": 0.0, "grad_norm": 4.9460673332214355, "kl": 0.02885538199916482, "learning_rate": 4.621666666666667e-06, "loss": 0.1455, "num_tokens": 515100.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 1682 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 31.166666666666668, "frac_reward_zero_std": 1.0, "grad_norm": 0.0004609346215147525, "kl": 1.6354024410247803e-05, "learning_rate": 4.621111111111111e-06, "loss": 0.0, "num_tokens": 515320.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 1683 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 31.185185185185187, "frac_reward_zero_std": 1.0, "grad_norm": 0.06466642767190933, "kl": 0.03135494142770767, "learning_rate": 4.620555555555556e-06, "loss": 0.0016, "num_tokens": 515536.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 1684 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 31.203703703703702, "frac_reward_zero_std": 1.0, "grad_norm": 0.18326666951179504, "kl": 0.017277836799621582, "learning_rate": 4.620000000000001e-06, "loss": 0.0009, "num_tokens": 515796.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 1685 }, { "clip_ratio/high_max": 0.0062500000931322575, "clip_ratio/high_mean": 0.0062500000931322575, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0062500000931322575, "completion_length": 46.75, "completions/clipped_ratio": 0.0, "completions/max_length": 75.0, "completions/max_terminated_length": 75.0, "completions/mean_length": 46.75, "completions/mean_terminated_length": 46.75, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 31.22222222222222, "frac_reward_zero_std": 0.0, "grad_norm": 2.811213493347168, "kl": 0.06596019119024277, "learning_rate": 4.619444444444445e-06, "loss": -0.0879, "num_tokens": 516207.0, "reward": 5.0, "reward_std": 3.34165620803833, "rewards/reward_combined/mean": 5.0, "rewards/reward_combined/std": 3.34165620803833, "step": 1686 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 31.24074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.04865988716483116, "kl": 0.003214521799236536, "learning_rate": 4.618888888888889e-06, "loss": 0.0002, "num_tokens": 516475.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1687 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.25, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.25, "completions/mean_terminated_length": 13.25, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 31.25925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.13110439479351044, "kl": 0.035155363380908966, "learning_rate": 4.618333333333334e-06, "loss": 0.0018, "num_tokens": 516744.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1688 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 31.27777777777778, "frac_reward_zero_std": 1.0, "grad_norm": 0.0020035000052303076, "kl": 0.018368509598076344, "learning_rate": 4.617777777777778e-06, "loss": 0.0009, "num_tokens": 517004.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1689 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 41.75, "completions/clipped_ratio": 0.0, "completions/max_length": 43.0, "completions/max_terminated_length": 43.0, "completions/mean_length": 41.75, "completions/mean_terminated_length": 41.75, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 31.296296296296298, "frac_reward_zero_std": 0.0, "grad_norm": 2.0478789806365967, "kl": 0.09045041911303997, "learning_rate": 4.6172222222222224e-06, "loss": -0.0269, "num_tokens": 517451.0, "reward": 2.375, "reward_std": 1.25, "rewards/reward_combined/mean": 2.375, "rewards/reward_combined/std": 1.25, "step": 1690 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.75, "completions/clipped_ratio": 0.0, "completions/max_length": 30.0, "completions/max_terminated_length": 30.0, "completions/mean_length": 15.75, "completions/mean_terminated_length": 15.75, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 31.314814814814813, "frac_reward_zero_std": 0.0, "grad_norm": 4.979264259338379, "kl": 0.03822304308414459, "learning_rate": 4.616666666666667e-06, "loss": 0.2761, "num_tokens": 517734.0, "reward": 6.5, "reward_std": 1.154700517654419, "rewards/reward_combined/mean": 6.5, "rewards/reward_combined/std": 1.154700517654419, "step": 1691 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.25, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 14.25, "completions/mean_terminated_length": 14.25, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 31.333333333333332, "frac_reward_zero_std": 0.0, "grad_norm": 2.7016420364379883, "kl": 0.03165926691144705, "learning_rate": 4.616111111111112e-06, "loss": 0.0821, "num_tokens": 518011.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 1692 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 31.35185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.01204725168645382, "kl": 0.03354608826339245, "learning_rate": 4.6155555555555555e-06, "loss": 0.0017, "num_tokens": 518279.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1693 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 5.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 5.0, "completions/mean_terminated_length": 5.0, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 31.37037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.0035188416950404644, "kl": 0.003091919468715787, "learning_rate": 4.615000000000001e-06, "loss": 0.0002, "num_tokens": 518499.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 1694 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 31.38888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.0014806741382926702, "kl": 0.011214002966880798, "learning_rate": 4.614444444444445e-06, "loss": 0.0006, "num_tokens": 518735.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 1695 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 31.75, "completions/clipped_ratio": 0.0, "completions/max_length": 36.0, "completions/max_terminated_length": 36.0, "completions/mean_length": 31.75, "completions/mean_terminated_length": 31.75, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 31.40740740740741, "frac_reward_zero_std": 0.0, "grad_norm": 4.165841579437256, "kl": 0.11836159974336624, "learning_rate": 4.613888888888889e-06, "loss": -0.0538, "num_tokens": 519098.0, "reward": 3.375, "reward_std": 0.25, "rewards/reward_combined/mean": 3.375, "rewards/reward_combined/std": 0.25, "step": 1696 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.5, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 2.5, "completions/mean_terminated_length": 2.5, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 31.425925925925927, "frac_reward_zero_std": 1.0, "grad_norm": 0.14219821989536285, "kl": 0.010300129652023315, "learning_rate": 4.613333333333334e-06, "loss": 0.0005, "num_tokens": 519304.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 1697 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.5, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 22.5, "completions/mean_terminated_length": 22.5, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 31.444444444444443, "frac_reward_zero_std": 1.0, "grad_norm": 0.13890843093395233, "kl": 0.028576523065567017, "learning_rate": 4.612777777777778e-06, "loss": 0.0015, "num_tokens": 519622.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1698 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.75, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 18.75, "completions/mean_terminated_length": 18.75, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 31.462962962962962, "frac_reward_zero_std": 0.0, "grad_norm": 3.6153411865234375, "kl": 0.18784934654831886, "learning_rate": 4.6122222222222225e-06, "loss": -0.0048, "num_tokens": 519949.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 1699 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.5, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 18.5, "completions/mean_terminated_length": 18.5, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 31.48148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.1438974142074585, "kl": 0.014405475463718176, "learning_rate": 4.611666666666667e-06, "loss": 0.0008, "num_tokens": 520263.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1700 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.5, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 12.5, "completions/mean_terminated_length": 12.5, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 31.5, "frac_reward_zero_std": 1.0, "grad_norm": 0.156638503074646, "kl": 0.012206172570586205, "learning_rate": 4.611111111111112e-06, "loss": 0.0006, "num_tokens": 520525.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1701 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 31.51851851851852, "frac_reward_zero_std": 0.0, "grad_norm": 5.437230110168457, "kl": 0.061550937592983246, "learning_rate": 4.6105555555555556e-06, "loss": -0.1008, "num_tokens": 520809.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 1702 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 18.0, "completions/mean_terminated_length": 18.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 31.537037037037038, "frac_reward_zero_std": 1.0, "grad_norm": 0.009305350482463837, "kl": 0.0044992255279794335, "learning_rate": 4.610000000000001e-06, "loss": 0.0002, "num_tokens": 521109.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1703 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 31.555555555555557, "frac_reward_zero_std": 1.0, "grad_norm": 0.03650277853012085, "kl": 0.007688512559980154, "learning_rate": 4.609444444444445e-06, "loss": 0.0004, "num_tokens": 521389.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1704 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.0, "completions/clipped_ratio": 0.0, "completions/max_length": 32.0, "completions/max_terminated_length": 32.0, "completions/mean_length": 22.0, "completions/mean_terminated_length": 22.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 31.574074074074073, "frac_reward_zero_std": 1.0, "grad_norm": 0.1966223269701004, "kl": 0.09185861423611641, "learning_rate": 4.6088888888888895e-06, "loss": 0.0043, "num_tokens": 521705.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1705 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 31.59259259259259, "frac_reward_zero_std": 1.0, "grad_norm": 0.16613268852233887, "kl": 0.08781424164772034, "learning_rate": 4.608333333333334e-06, "loss": 0.0044, "num_tokens": 522001.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1706 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 31.61111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.035715583711862564, "kl": 0.006223449774552137, "learning_rate": 4.607777777777778e-06, "loss": 0.0004, "num_tokens": 522265.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1707 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 31.62962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.02597753517329693, "kl": 0.009900204837322235, "learning_rate": 4.6072222222222225e-06, "loss": 0.0005, "num_tokens": 522577.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 1708 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 31.64814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.1098671406507492, "kl": 0.008122900209855288, "learning_rate": 4.606666666666667e-06, "loss": 0.0004, "num_tokens": 522833.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 1709 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 29.75, "completions/clipped_ratio": 0.0, "completions/max_length": 34.0, "completions/max_terminated_length": 34.0, "completions/mean_length": 29.75, "completions/mean_terminated_length": 29.75, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 31.666666666666668, "frac_reward_zero_std": 0.0, "grad_norm": 4.134155750274658, "kl": 0.1468386948108673, "learning_rate": 4.606111111111112e-06, "loss": 0.0088, "num_tokens": 523180.0, "reward": 4.25, "reward_std": 2.1794495582580566, "rewards/reward_combined/mean": 4.25, "rewards/reward_combined/std": 2.1794495582580566, "step": 1710 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.0, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 21.0, "completions/mean_terminated_length": 21.0, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 31.685185185185187, "frac_reward_zero_std": 1.0, "grad_norm": 0.06237058341503143, "kl": 0.04026004299521446, "learning_rate": 4.605555555555556e-06, "loss": 0.002, "num_tokens": 523512.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1711 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.25, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 17.25, "completions/mean_terminated_length": 17.25, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 31.703703703703702, "frac_reward_zero_std": 1.0, "grad_norm": 0.06679429858922958, "kl": 0.0058994959108531475, "learning_rate": 4.605000000000001e-06, "loss": 0.0003, "num_tokens": 523841.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1712 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.75, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 18.75, "completions/mean_terminated_length": 18.75, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 31.72222222222222, "frac_reward_zero_std": 1.0, "grad_norm": 0.1449667066335678, "kl": 0.02989705093204975, "learning_rate": 4.604444444444444e-06, "loss": 0.0015, "num_tokens": 524136.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1713 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 30.0, "completions/clipped_ratio": 0.0, "completions/max_length": 33.0, "completions/max_terminated_length": 33.0, "completions/mean_length": 30.0, "completions/mean_terminated_length": 30.0, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 31.74074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.6607390642166138, "kl": 0.10027143359184265, "learning_rate": 4.6038888888888895e-06, "loss": 0.0051, "num_tokens": 524472.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 1714 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.75, "completions/clipped_ratio": 0.0, "completions/max_length": 39.0, "completions/max_terminated_length": 39.0, "completions/mean_length": 22.75, "completions/mean_terminated_length": 22.75, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 31.75925925925926, "frac_reward_zero_std": 0.0, "grad_norm": 5.0567402839660645, "kl": 0.2537401616573334, "learning_rate": 4.603333333333334e-06, "loss": 0.2118, "num_tokens": 524803.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 1715 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 31.77777777777778, "frac_reward_zero_std": 1.0, "grad_norm": 0.04071876406669617, "kl": 0.005550673115067184, "learning_rate": 4.602777777777778e-06, "loss": 0.0003, "num_tokens": 525089.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1716 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.5, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 31.796296296296298, "frac_reward_zero_std": 1.0, "grad_norm": 0.08567607402801514, "kl": 0.011332066729664803, "learning_rate": 4.602222222222223e-06, "loss": 0.0006, "num_tokens": 525391.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1717 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 31.814814814814813, "frac_reward_zero_std": 1.0, "grad_norm": 0.0802842229604721, "kl": 0.005565584870055318, "learning_rate": 4.601666666666667e-06, "loss": 0.0003, "num_tokens": 525655.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1718 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 31.833333333333332, "frac_reward_zero_std": 1.0, "grad_norm": 0.012097869999706745, "kl": 0.0009144656360149384, "learning_rate": 4.601111111111112e-06, "loss": 0.0, "num_tokens": 525899.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 1719 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.75, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 10.75, "completions/mean_terminated_length": 10.75, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 31.85185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.08861463516950607, "kl": 0.012803372461348772, "learning_rate": 4.600555555555556e-06, "loss": 0.0006, "num_tokens": 526170.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1720 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 31.87037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.04879658296704292, "kl": 0.007392128696665168, "learning_rate": 4.600000000000001e-06, "loss": 0.0004, "num_tokens": 526446.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1721 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.25, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 4.25, "completions/mean_terminated_length": 4.25, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 31.88888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.15244020521640778, "kl": 0.014907035045325756, "learning_rate": 4.599444444444444e-06, "loss": 0.0008, "num_tokens": 526659.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 1722 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 31.90740740740741, "frac_reward_zero_std": 0.0, "grad_norm": 14.625046730041504, "kl": 0.009605729777831584, "learning_rate": 4.5988888888888896e-06, "loss": 0.0004, "num_tokens": 526919.0, "reward": 2.25, "reward_std": 2.5, "rewards/reward_combined/mean": 2.25, "rewards/reward_combined/std": 2.5, "step": 1723 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 30.0, "completions/clipped_ratio": 0.0, "completions/max_length": 59.0, "completions/max_terminated_length": 59.0, "completions/mean_length": 30.0, "completions/mean_terminated_length": 30.0, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 31.925925925925927, "frac_reward_zero_std": 0.0, "grad_norm": 4.99356746673584, "kl": 0.1416907235980034, "learning_rate": 4.598333333333334e-06, "loss": 0.2564, "num_tokens": 527259.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 1724 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 7.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 7.0, "completions/mean_terminated_length": 7.0, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 31.944444444444443, "frac_reward_zero_std": 1.0, "grad_norm": 0.011436820030212402, "kl": 0.013168424367904663, "learning_rate": 4.597777777777778e-06, "loss": 0.0007, "num_tokens": 527495.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 1725 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.75, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 23.75, "completions/mean_terminated_length": 23.75, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 31.962962962962962, "frac_reward_zero_std": 0.0, "grad_norm": 6.399114608764648, "kl": 0.03958158753812313, "learning_rate": 4.597222222222223e-06, "loss": 0.0339, "num_tokens": 527834.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 1726 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 31.98148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.027128493413329124, "kl": 0.0033853623317554593, "learning_rate": 4.596666666666667e-06, "loss": 0.0002, "num_tokens": 528116.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1727 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 32.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.09736764430999756, "kl": 0.037935562431812286, "learning_rate": 4.596111111111111e-06, "loss": 0.0019, "num_tokens": 528408.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1728 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.25, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 17.25, "completions/mean_terminated_length": 17.25, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 32.01851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.09225521981716156, "kl": 0.011160872876644135, "learning_rate": 4.595555555555556e-06, "loss": 0.0005, "num_tokens": 528701.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1729 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 32.03703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.05267498642206192, "kl": 0.0009283125400543213, "learning_rate": 4.595000000000001e-06, "loss": 0.0, "num_tokens": 528913.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 1730 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 32.05555555555556, "frac_reward_zero_std": 0.0, "grad_norm": 4.127925395965576, "kl": 0.05153932236135006, "learning_rate": 4.594444444444444e-06, "loss": 0.0326, "num_tokens": 529207.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 1731 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 32.074074074074076, "frac_reward_zero_std": 1.0, "grad_norm": 0.008587763644754887, "kl": 0.0008292645215988159, "learning_rate": 4.59388888888889e-06, "loss": 0.0, "num_tokens": 529451.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 1732 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 44.5, "completions/clipped_ratio": 0.0, "completions/max_length": 110.0, "completions/max_terminated_length": 110.0, "completions/mean_length": 44.5, "completions/mean_terminated_length": 44.5, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 32.092592592592595, "frac_reward_zero_std": 0.0, "grad_norm": 1.9628651142120361, "kl": 0.08699945360422134, "learning_rate": 4.593333333333333e-06, "loss": -0.3047, "num_tokens": 529869.0, "reward": 5.875, "reward_std": 3.326033592224121, "rewards/reward_combined/mean": 5.875, "rewards/reward_combined/std": 3.326033592224121, "step": 1733 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 32.111111111111114, "frac_reward_zero_std": 1.0, "grad_norm": 0.02756941318511963, "kl": 0.031238175928592682, "learning_rate": 4.592777777777778e-06, "loss": 0.0016, "num_tokens": 530085.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 1734 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.5, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 32.129629629629626, "frac_reward_zero_std": 1.0, "grad_norm": 0.26704564690589905, "kl": 0.017246655479539186, "learning_rate": 4.592222222222223e-06, "loss": 0.0012, "num_tokens": 530359.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1735 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 32.148148148148145, "frac_reward_zero_std": 1.0, "grad_norm": 0.03889984264969826, "kl": 0.00574718345887959, "learning_rate": 4.591666666666667e-06, "loss": 0.0003, "num_tokens": 530635.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1736 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.75, "completions/clipped_ratio": 0.0, "completions/max_length": 29.0, "completions/max_terminated_length": 29.0, "completions/mean_length": 24.75, "completions/mean_terminated_length": 24.75, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 32.166666666666664, "frac_reward_zero_std": 0.0, "grad_norm": 2.9940693378448486, "kl": 0.13697947934269905, "learning_rate": 4.591111111111111e-06, "loss": 0.0212, "num_tokens": 530954.0, "reward": 2.0, "reward_std": 2.4494898319244385, "rewards/reward_combined/mean": 2.0, "rewards/reward_combined/std": 2.4494898319244385, "step": 1737 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 32.18518518518518, "frac_reward_zero_std": 1.0, "grad_norm": 0.0001352473336737603, "kl": 4.366040229797363e-06, "learning_rate": 4.590555555555556e-06, "loss": 0.0, "num_tokens": 531174.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 1738 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 7.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 7.0, "completions/mean_terminated_length": 7.0, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 32.2037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.010737295262515545, "kl": 0.013351525645703077, "learning_rate": 4.590000000000001e-06, "loss": 0.0007, "num_tokens": 531410.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 1739 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 18.0, "completions/mean_terminated_length": 18.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 32.22222222222222, "frac_reward_zero_std": 1.0, "grad_norm": 0.03442300483584404, "kl": 0.04089740291237831, "learning_rate": 4.5894444444444445e-06, "loss": 0.002, "num_tokens": 531730.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 1740 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 32.24074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.005330985877662897, "kl": 0.03149745240807533, "learning_rate": 4.58888888888889e-06, "loss": 0.0016, "num_tokens": 531998.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1741 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 32.25925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.15124620497226715, "kl": 0.042321259155869484, "learning_rate": 4.588333333333333e-06, "loss": 0.0023, "num_tokens": 532298.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1742 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.25, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.25, "completions/mean_terminated_length": 14.25, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 32.27777777777778, "frac_reward_zero_std": 1.0, "grad_norm": 0.026008371263742447, "kl": 0.022576000541448593, "learning_rate": 4.587777777777778e-06, "loss": 0.0012, "num_tokens": 532587.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1743 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.0, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 24.0, "completions/mean_terminated_length": 24.0, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 32.2962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.003949982114136219, "kl": 0.09000347927212715, "learning_rate": 4.587222222222223e-06, "loss": 0.0045, "num_tokens": 532951.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 1744 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 32.31481481481482, "frac_reward_zero_std": 1.0, "grad_norm": 0.025523478165268898, "kl": 0.21571629494428635, "learning_rate": 4.586666666666667e-06, "loss": 0.0108, "num_tokens": 533255.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1745 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 7.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 7.0, "completions/mean_terminated_length": 7.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 32.333333333333336, "frac_reward_zero_std": 1.0, "grad_norm": 0.05372706428170204, "kl": 0.0009036928531713784, "learning_rate": 4.5861111111111114e-06, "loss": 0.0, "num_tokens": 533503.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 1746 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.0, "completions/max_length": 33.0, "completions/max_terminated_length": 33.0, "completions/mean_length": 32.0, "completions/mean_terminated_length": 32.0, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 32.351851851851855, "frac_reward_zero_std": 1.0, "grad_norm": 0.23930537700653076, "kl": 0.10073016956448555, "learning_rate": 4.585555555555556e-06, "loss": 0.005, "num_tokens": 533855.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1747 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 41.5, "completions/clipped_ratio": 0.0, "completions/max_length": 78.0, "completions/max_terminated_length": 78.0, "completions/mean_length": 41.5, "completions/mean_terminated_length": 41.5, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 32.370370370370374, "frac_reward_zero_std": 0.0, "grad_norm": 3.902867078781128, "kl": 0.19815582036972046, "learning_rate": 4.585e-06, "loss": 0.2497, "num_tokens": 534257.0, "reward": 2.375, "reward_std": 2.25, "rewards/reward_combined/mean": 2.375, "rewards/reward_combined/std": 2.25, "step": 1748 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 32.388888888888886, "frac_reward_zero_std": 1.0, "grad_norm": 0.03646647185087204, "kl": 0.006142941070720553, "learning_rate": 4.5844444444444445e-06, "loss": 0.0003, "num_tokens": 534557.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1749 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 46.0, "completions/clipped_ratio": 0.0, "completions/max_length": 55.0, "completions/max_terminated_length": 55.0, "completions/mean_length": 46.0, "completions/mean_terminated_length": 46.0, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 32.407407407407405, "frac_reward_zero_std": 1.0, "grad_norm": 0.027434686198830605, "kl": 0.060694847255945206, "learning_rate": 4.58388888888889e-06, "loss": 0.003, "num_tokens": 535021.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 1750 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.5, "completions/clipped_ratio": 0.0, "completions/max_length": 28.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 23.5, "completions/mean_terminated_length": 23.5, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 32.425925925925924, "frac_reward_zero_std": 0.0, "grad_norm": 3.1358354091644287, "kl": 0.0736173614859581, "learning_rate": 4.583333333333333e-06, "loss": 0.0066, "num_tokens": 535367.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 1751 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.0, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 20.0, "completions/mean_terminated_length": 20.0, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 32.44444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.0555155873298645, "kl": 0.033579835668206215, "learning_rate": 4.582777777777778e-06, "loss": 0.0017, "num_tokens": 535699.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1752 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 32.46296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.18175001442432404, "kl": 0.020103135146200657, "learning_rate": 4.582222222222223e-06, "loss": 0.0011, "num_tokens": 535989.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1753 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 32.48148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.02175629325211048, "kl": 0.012958759441971779, "learning_rate": 4.581666666666667e-06, "loss": 0.0006, "num_tokens": 536301.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 1754 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 18.0, "completions/mean_terminated_length": 18.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 32.5, "frac_reward_zero_std": 1.0, "grad_norm": 0.026116888970136642, "kl": 0.005276753334328532, "learning_rate": 4.5811111111111115e-06, "loss": 0.0003, "num_tokens": 536601.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1755 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 32.51851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.006749274674803019, "kl": 0.000292852520942688, "learning_rate": 4.580555555555556e-06, "loss": 0.0, "num_tokens": 536813.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 1756 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.5, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 22.5, "completions/mean_terminated_length": 22.5, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 32.53703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.12619085609912872, "kl": 0.026050676591694355, "learning_rate": 4.58e-06, "loss": 0.0013, "num_tokens": 537131.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1757 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 30.5, "completions/clipped_ratio": 0.0, "completions/max_length": 34.0, "completions/max_terminated_length": 34.0, "completions/mean_length": 30.5, "completions/mean_terminated_length": 30.5, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 32.55555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.19865597784519196, "kl": 0.05396121181547642, "learning_rate": 4.5794444444444446e-06, "loss": 0.0025, "num_tokens": 537469.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 1758 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 7.75, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 7.75, "completions/mean_terminated_length": 7.75, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 32.574074074074076, "frac_reward_zero_std": 1.0, "grad_norm": 0.18297921121120453, "kl": 0.014537587761878967, "learning_rate": 4.57888888888889e-06, "loss": 0.001, "num_tokens": 537696.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 1759 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 32.592592592592595, "frac_reward_zero_std": 1.0, "grad_norm": 0.6117445230484009, "kl": 0.08854137361049652, "learning_rate": 4.578333333333333e-06, "loss": 0.0057, "num_tokens": 537974.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1760 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 32.611111111111114, "frac_reward_zero_std": 1.0, "grad_norm": 0.17570602893829346, "kl": 0.03480658563785255, "learning_rate": 4.5777777777777785e-06, "loss": 0.0019, "num_tokens": 538244.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1761 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 32.629629629629626, "frac_reward_zero_std": 1.0, "grad_norm": 0.001677262014709413, "kl": 0.011176548898220062, "learning_rate": 4.577222222222222e-06, "loss": 0.0006, "num_tokens": 538480.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 1762 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.25, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 20.25, "completions/mean_terminated_length": 20.25, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 32.648148148148145, "frac_reward_zero_std": 1.0, "grad_norm": 2.148606061935425, "kl": 0.41938649863004684, "learning_rate": 4.576666666666667e-06, "loss": 0.0195, "num_tokens": 538809.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1763 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 30.25, "completions/clipped_ratio": 0.0, "completions/max_length": 34.0, "completions/max_terminated_length": 34.0, "completions/mean_length": 30.25, "completions/mean_terminated_length": 30.25, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 32.666666666666664, "frac_reward_zero_std": 1.0, "grad_norm": 0.3813815414905548, "kl": 0.17621751129627228, "learning_rate": 4.5761111111111115e-06, "loss": 0.0089, "num_tokens": 539158.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1764 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.25, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.25, "completions/mean_terminated_length": 13.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 32.68518518518518, "frac_reward_zero_std": 1.0, "grad_norm": 0.021072592586278915, "kl": 0.0029688880313187838, "learning_rate": 4.575555555555556e-06, "loss": 0.0001, "num_tokens": 539435.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1765 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.5, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 22.5, "completions/mean_terminated_length": 22.5, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 32.7037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.12732945382595062, "kl": 0.03218704368919134, "learning_rate": 4.575e-06, "loss": 0.0016, "num_tokens": 539769.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1766 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 9.5, "completions/mean_terminated_length": 9.5, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 32.72222222222222, "frac_reward_zero_std": 1.0, "grad_norm": 0.021309398114681244, "kl": 0.0015784281422384083, "learning_rate": 4.574444444444445e-06, "loss": 0.0001, "num_tokens": 540027.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1767 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 32.74074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.05123296007514, "kl": 0.10556807368993759, "learning_rate": 4.57388888888889e-06, "loss": 0.0052, "num_tokens": 540297.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1768 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.25, "completions/clipped_ratio": 0.0, "completions/max_length": 42.0, "completions/max_terminated_length": 42.0, "completions/mean_length": 23.25, "completions/mean_terminated_length": 23.25, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 32.75925925925926, "frac_reward_zero_std": 0.0, "grad_norm": 4.788128852844238, "kl": 0.05937246419489384, "learning_rate": 4.573333333333333e-06, "loss": 0.2527, "num_tokens": 540610.0, "reward": 5.0, "reward_std": 5.338539123535156, "rewards/reward_combined/mean": 5.0, "rewards/reward_combined/std": 5.338539123535156, "step": 1769 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 32.77777777777778, "frac_reward_zero_std": 1.0, "grad_norm": 0.021466707810759544, "kl": 0.00184092135168612, "learning_rate": 4.5727777777777785e-06, "loss": 0.0001, "num_tokens": 540934.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1770 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 32.7962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.0202710572630167, "kl": 0.002663541352376342, "learning_rate": 4.572222222222222e-06, "loss": 0.0001, "num_tokens": 541218.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1771 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.5, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 21.5, "completions/mean_terminated_length": 21.5, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 32.81481481481482, "frac_reward_zero_std": 0.0, "grad_norm": 3.6179428100585938, "kl": 0.19780117645859718, "learning_rate": 4.571666666666667e-06, "loss": 0.005, "num_tokens": 541524.0, "reward": 4.625, "reward_std": 3.4731109142303467, "rewards/reward_combined/mean": 4.625, "rewards/reward_combined/std": 3.4731109142303467, "step": 1772 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 32.833333333333336, "frac_reward_zero_std": 1.0, "grad_norm": 0.003268477274104953, "kl": 0.018162868916988373, "learning_rate": 4.571111111111112e-06, "loss": 0.0009, "num_tokens": 541784.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1773 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.009433962404727936, "clip_ratio/low_min": 0.009433962404727936, "clip_ratio/region_mean": 0.009433962404727936, "completion_length": 83.25, "completions/clipped_ratio": 0.25, "completions/max_length": 256.0, "completions/max_terminated_length": 31.0, "completions/mean_length": 83.25, "completions/mean_terminated_length": 25.666667938232422, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 32.851851851851855, "frac_reward_zero_std": 0.0, "grad_norm": 2.027712106704712, "kl": 0.08392906561493874, "learning_rate": 4.570555555555556e-06, "loss": 0.4719, "num_tokens": 542341.0, "reward": 3.25, "reward_std": 5.057997226715088, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 5.057997226715088, "step": 1774 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.25, "completions/clipped_ratio": 0.0, "completions/max_length": 44.0, "completions/max_terminated_length": 44.0, "completions/mean_length": 25.25, "completions/mean_terminated_length": 25.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 32.870370370370374, "frac_reward_zero_std": 1.0, "grad_norm": 0.14236770570278168, "kl": 0.059472810477018356, "learning_rate": 4.57e-06, "loss": 0.0032, "num_tokens": 542662.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1775 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.5, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 18.5, "completions/mean_terminated_length": 18.5, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 32.888888888888886, "frac_reward_zero_std": 0.0, "grad_norm": 6.700425148010254, "kl": 0.011621040874160826, "learning_rate": 4.569444444444445e-06, "loss": 0.0337, "num_tokens": 542976.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 1776 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.75, "completions/clipped_ratio": 0.0, "completions/max_length": 35.0, "completions/max_terminated_length": 35.0, "completions/mean_length": 14.75, "completions/mean_terminated_length": 14.75, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 32.907407407407405, "frac_reward_zero_std": 0.0, "grad_norm": 5.689311504364014, "kl": 0.06816429644823074, "learning_rate": 4.568888888888889e-06, "loss": 0.3284, "num_tokens": 543263.0, "reward": 3.125, "reward_std": 1.75, "rewards/reward_combined/mean": 3.125, "rewards/reward_combined/std": 1.75, "step": 1777 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 5.5, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 5.5, "completions/mean_terminated_length": 5.5, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 32.925925925925924, "frac_reward_zero_std": 1.0, "grad_norm": 0.5365185737609863, "kl": 0.054264222853817046, "learning_rate": 4.568333333333333e-06, "loss": 0.0031, "num_tokens": 543485.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 1778 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 32.94444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.08230657875537872, "kl": 0.004137760493904352, "learning_rate": 4.5677777777777786e-06, "loss": 0.0002, "num_tokens": 543741.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 1779 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 32.96296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.05956551805138588, "kl": 0.0015111267566680908, "learning_rate": 4.567222222222222e-06, "loss": 0.0001, "num_tokens": 544009.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1780 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.0, "completions/clipped_ratio": 0.0, "completions/max_length": 35.0, "completions/max_terminated_length": 35.0, "completions/mean_length": 24.0, "completions/mean_terminated_length": 24.0, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 32.98148148148148, "frac_reward_zero_std": 0.0, "grad_norm": 6.075844764709473, "kl": 0.15768632292747498, "learning_rate": 4.566666666666667e-06, "loss": 0.1647, "num_tokens": 544333.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 1781 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.25, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.25, "completions/mean_terminated_length": 11.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 33.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.07092377543449402, "kl": 0.012648920994251966, "learning_rate": 4.566111111111112e-06, "loss": 0.0006, "num_tokens": 544606.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1782 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 33.01851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.029348373413085938, "kl": 0.03527802135795355, "learning_rate": 4.565555555555556e-06, "loss": 0.0018, "num_tokens": 544878.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1783 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.25, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 17.25, "completions/mean_terminated_length": 17.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 33.03703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.04329697787761688, "kl": 0.1407695822417736, "learning_rate": 4.565e-06, "loss": 0.0071, "num_tokens": 545187.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1784 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 33.05555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.004675895906984806, "kl": 0.00014518201351165771, "learning_rate": 4.564444444444445e-06, "loss": 0.0, "num_tokens": 545399.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 1785 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 33.074074074074076, "frac_reward_zero_std": 1.0, "grad_norm": 0.15131975710391998, "kl": 0.23851999640464783, "learning_rate": 4.563888888888889e-06, "loss": 0.0118, "num_tokens": 545701.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1786 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 33.092592592592595, "frac_reward_zero_std": 1.0, "grad_norm": 0.20012055337429047, "kl": 0.015992484986782074, "learning_rate": 4.563333333333333e-06, "loss": 0.0009, "num_tokens": 545957.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 1787 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 33.111111111111114, "frac_reward_zero_std": 1.0, "grad_norm": 0.06897109001874924, "kl": 0.00356253981590271, "learning_rate": 4.562777777777779e-06, "loss": 0.0002, "num_tokens": 546217.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 1788 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 33.129629629629626, "frac_reward_zero_std": 1.0, "grad_norm": 0.04659740999341011, "kl": 0.006986615248024464, "learning_rate": 4.562222222222222e-06, "loss": 0.0003, "num_tokens": 546517.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1789 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 33.148148148148145, "frac_reward_zero_std": 1.0, "grad_norm": 0.0014680225867778063, "kl": 0.011166132986545563, "learning_rate": 4.561666666666667e-06, "loss": 0.0006, "num_tokens": 546753.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 1790 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 33.166666666666664, "frac_reward_zero_std": 1.0, "grad_norm": 0.033093180507421494, "kl": 0.03130766749382019, "learning_rate": 4.561111111111112e-06, "loss": 0.0016, "num_tokens": 546969.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 1791 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 41.5, "completions/clipped_ratio": 0.0, "completions/max_length": 49.0, "completions/max_terminated_length": 49.0, "completions/mean_length": 41.5, "completions/mean_terminated_length": 41.5, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 33.18518518518518, "frac_reward_zero_std": 1.0, "grad_norm": 0.19867372512817383, "kl": 0.08581119030714035, "learning_rate": 4.560555555555556e-06, "loss": 0.0042, "num_tokens": 547415.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 1792 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 33.2037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.0924038365483284, "kl": 0.0015111491084098816, "learning_rate": 4.56e-06, "loss": 0.0001, "num_tokens": 547627.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 1793 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 18.0, "completions/mean_terminated_length": 18.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 33.22222222222222, "frac_reward_zero_std": 1.0, "grad_norm": 0.009577823802828789, "kl": 0.0019631253089755774, "learning_rate": 4.559444444444445e-06, "loss": 0.0001, "num_tokens": 547939.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1794 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.75, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 13.75, "completions/mean_terminated_length": 13.75, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 33.24074074074074, "frac_reward_zero_std": 0.0, "grad_norm": 3.956066846847534, "kl": 0.12626105919480324, "learning_rate": 4.558888888888889e-06, "loss": 0.173, "num_tokens": 548218.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 1795 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.75, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 23.75, "completions/mean_terminated_length": 23.75, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 33.25925925925926, "frac_reward_zero_std": 0.0, "grad_norm": 0.843658983707428, "kl": 0.12953884899616241, "learning_rate": 4.5583333333333335e-06, "loss": -0.0043, "num_tokens": 548581.0, "reward": 2.875, "reward_std": 1.25, "rewards/reward_combined/mean": 2.875, "rewards/reward_combined/std": 1.25, "step": 1796 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.0, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 21.0, "completions/mean_terminated_length": 21.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 33.27777777777778, "frac_reward_zero_std": 0.0, "grad_norm": 4.758306980133057, "kl": 0.07609960436820984, "learning_rate": 4.557777777777778e-06, "loss": -0.036, "num_tokens": 548885.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 1797 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.0, "completions/clipped_ratio": 0.0, "completions/max_length": 29.0, "completions/max_terminated_length": 29.0, "completions/mean_length": 25.0, "completions/mean_terminated_length": 25.0, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 33.2962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.22762145102024078, "kl": 0.19094975292682648, "learning_rate": 4.557222222222222e-06, "loss": 0.0096, "num_tokens": 549213.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1798 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.25, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 19.25, "completions/mean_terminated_length": 19.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 33.31481481481482, "frac_reward_zero_std": 0.0, "grad_norm": 4.258441925048828, "kl": 0.10703568905591965, "learning_rate": 4.556666666666667e-06, "loss": 0.0825, "num_tokens": 549538.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 1799 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.5, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 20.5, "completions/mean_terminated_length": 20.5, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 33.333333333333336, "frac_reward_zero_std": 1.0, "grad_norm": 0.1428941786289215, "kl": 0.08193885907530785, "learning_rate": 4.556111111111112e-06, "loss": 0.0041, "num_tokens": 549840.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1800 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 27.0, "completions/clipped_ratio": 0.0, "completions/max_length": 42.0, "completions/max_terminated_length": 42.0, "completions/mean_length": 27.0, "completions/mean_terminated_length": 27.0, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 33.351851851851855, "frac_reward_zero_std": 0.0, "grad_norm": 4.668575286865234, "kl": 0.062431950122117996, "learning_rate": 4.555555555555556e-06, "loss": 0.1516, "num_tokens": 550168.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 1801 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 33.370370370370374, "frac_reward_zero_std": 1.0, "grad_norm": 0.0026484194677323103, "kl": 0.018257787451148033, "learning_rate": 4.5550000000000004e-06, "loss": 0.0009, "num_tokens": 550428.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1802 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 33.388888888888886, "frac_reward_zero_std": 1.0, "grad_norm": 0.0683375746011734, "kl": 0.0020206660956318956, "learning_rate": 4.554444444444445e-06, "loss": 0.0001, "num_tokens": 550684.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 1803 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.25, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 17.25, "completions/mean_terminated_length": 17.25, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 33.407407407407405, "frac_reward_zero_std": 0.0, "grad_norm": 4.1799821853637695, "kl": 0.015828561037778854, "learning_rate": 4.553888888888889e-06, "loss": 0.0684, "num_tokens": 550977.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 1804 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 33.425925925925924, "frac_reward_zero_std": 1.0, "grad_norm": 0.054885394871234894, "kl": 0.03551517799496651, "learning_rate": 4.5533333333333335e-06, "loss": 0.0018, "num_tokens": 551269.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1805 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.75, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 2.75, "completions/mean_terminated_length": 2.75, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 33.44444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.09110985696315765, "kl": 0.004362988285720348, "learning_rate": 4.552777777777778e-06, "loss": 0.0002, "num_tokens": 551476.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 1806 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 5.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 5.0, "completions/mean_terminated_length": 5.0, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 33.46296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.002290858654305339, "kl": 0.002392554306425154, "learning_rate": 4.552222222222222e-06, "loss": 0.0001, "num_tokens": 551696.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 1807 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 33.48148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.017489047721028328, "kl": 0.0030585642671212554, "learning_rate": 4.551666666666667e-06, "loss": 0.0002, "num_tokens": 551980.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1808 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.75, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 19.75, "completions/mean_terminated_length": 19.75, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 33.5, "frac_reward_zero_std": 1.0, "grad_norm": 0.36233827471733093, "kl": 0.08540670201182365, "learning_rate": 4.551111111111112e-06, "loss": 0.0044, "num_tokens": 552307.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 1809 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 30.25, "completions/clipped_ratio": 0.0, "completions/max_length": 37.0, "completions/max_terminated_length": 37.0, "completions/mean_length": 30.25, "completions/mean_terminated_length": 30.25, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 33.51851851851852, "frac_reward_zero_std": 0.0, "grad_norm": 4.324534893035889, "kl": 0.10989563912153244, "learning_rate": 4.550555555555556e-06, "loss": 0.0516, "num_tokens": 552652.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 1810 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 74.75, "completions/clipped_ratio": 0.25, "completions/max_length": 256.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 74.75, "completions/mean_terminated_length": 14.333333969116211, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 33.53703703703704, "frac_reward_zero_std": 0.0, "grad_norm": 4.359931945800781, "kl": 0.020933632273226976, "learning_rate": 4.5500000000000005e-06, "loss": 0.1829, "num_tokens": 553171.0, "reward": 3.049999952316284, "reward_std": 4.00124979019165, "rewards/reward_combined/mean": 3.049999952316284, "rewards/reward_combined/std": 4.00124979019165, "step": 1811 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 33.55555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.12894947826862335, "kl": 0.010606497060507536, "learning_rate": 4.549444444444445e-06, "loss": 0.0005, "num_tokens": 553449.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1812 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 33.574074074074076, "frac_reward_zero_std": 1.0, "grad_norm": 0.09907938539981842, "kl": 0.004567474126815796, "learning_rate": 4.548888888888889e-06, "loss": 0.0002, "num_tokens": 553669.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 1813 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 33.592592592592595, "frac_reward_zero_std": 1.0, "grad_norm": 0.005775399040430784, "kl": 0.02119988203048706, "learning_rate": 4.5483333333333335e-06, "loss": 0.0011, "num_tokens": 553961.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1814 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.25, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 10.25, "completions/mean_terminated_length": 10.25, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 33.611111111111114, "frac_reward_zero_std": 1.0, "grad_norm": 0.4462563693523407, "kl": 0.048507826402783394, "learning_rate": 4.547777777777778e-06, "loss": 0.0026, "num_tokens": 554222.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1815 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.5, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 22.5, "completions/mean_terminated_length": 22.5, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 33.629629629629626, "frac_reward_zero_std": 1.0, "grad_norm": 0.10013848543167114, "kl": 0.029817594215273857, "learning_rate": 4.547222222222223e-06, "loss": 0.0015, "num_tokens": 554556.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1816 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 33.648148148148145, "frac_reward_zero_std": 1.0, "grad_norm": 0.10899730771780014, "kl": 0.01488139946013689, "learning_rate": 4.546666666666667e-06, "loss": 0.0007, "num_tokens": 554828.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1817 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.75, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 16.75, "completions/mean_terminated_length": 16.75, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 33.666666666666664, "frac_reward_zero_std": 1.0, "grad_norm": 0.14108674228191376, "kl": 0.021150572458282113, "learning_rate": 4.546111111111112e-06, "loss": 0.0009, "num_tokens": 555123.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1818 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 33.68518518518518, "frac_reward_zero_std": 1.0, "grad_norm": 0.015320072881877422, "kl": 0.0004267394542694092, "learning_rate": 4.545555555555556e-06, "loss": 0.0, "num_tokens": 555419.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1819 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.25, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 23.25, "completions/mean_terminated_length": 23.25, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 33.7037037037037, "frac_reward_zero_std": 0.0, "grad_norm": 3.395540952682495, "kl": 0.026413092389702797, "learning_rate": 4.5450000000000005e-06, "loss": 0.0089, "num_tokens": 555740.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 1820 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 33.72222222222222, "frac_reward_zero_std": 1.0, "grad_norm": 0.020289316773414612, "kl": 0.009079569950699806, "learning_rate": 4.544444444444445e-06, "loss": 0.0005, "num_tokens": 556052.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 1821 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.25, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.25, "completions/mean_terminated_length": 13.25, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 33.74074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.058188945055007935, "kl": 0.0025787296472117305, "learning_rate": 4.543888888888889e-06, "loss": 0.0001, "num_tokens": 556317.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1822 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 33.75925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.009787640534341335, "kl": 0.000659596174955368, "learning_rate": 4.543333333333334e-06, "loss": 0.0, "num_tokens": 556561.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 1823 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.5, "completions/clipped_ratio": 0.0, "completions/max_length": 38.0, "completions/max_terminated_length": 38.0, "completions/mean_length": 32.5, "completions/mean_terminated_length": 32.5, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 33.77777777777778, "frac_reward_zero_std": 0.0, "grad_norm": 3.3351268768310547, "kl": 0.11476648040115833, "learning_rate": 4.542777777777778e-06, "loss": -0.0482, "num_tokens": 556907.0, "reward": 2.25, "reward_std": 1.443375587463379, "rewards/reward_combined/mean": 2.25, "rewards/reward_combined/std": 1.4433757066726685, "step": 1824 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 33.7962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.06226232275366783, "kl": 0.007018388714641333, "learning_rate": 4.542222222222223e-06, "loss": 0.0004, "num_tokens": 557179.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1825 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.25, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.25, "completions/mean_terminated_length": 14.25, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 33.81481481481482, "frac_reward_zero_std": 1.0, "grad_norm": 0.030865292996168137, "kl": 0.006758928764611483, "learning_rate": 4.541666666666667e-06, "loss": 0.0003, "num_tokens": 557468.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1826 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 33.833333333333336, "frac_reward_zero_std": 1.0, "grad_norm": 0.01290237158536911, "kl": 0.002668893779627979, "learning_rate": 4.541111111111112e-06, "loss": 0.0001, "num_tokens": 557746.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1827 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 18.0, "completions/mean_terminated_length": 18.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 33.851851851851855, "frac_reward_zero_std": 1.0, "grad_norm": 0.6057872176170349, "kl": 0.14684846252202988, "learning_rate": 4.540555555555556e-06, "loss": 0.0073, "num_tokens": 558046.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1828 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.0, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 20.0, "completions/mean_terminated_length": 20.0, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 33.870370370370374, "frac_reward_zero_std": 1.0, "grad_norm": 0.08441634476184845, "kl": 0.027026322670280933, "learning_rate": 4.540000000000001e-06, "loss": 0.0014, "num_tokens": 558378.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1829 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 7.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 7.0, "completions/mean_terminated_length": 7.0, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 33.888888888888886, "frac_reward_zero_std": 1.0, "grad_norm": 0.0138118090108037, "kl": 0.012391222175210714, "learning_rate": 4.539444444444445e-06, "loss": 0.0006, "num_tokens": 558614.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 1830 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 33.907407407407405, "frac_reward_zero_std": 1.0, "grad_norm": 0.0411069430410862, "kl": 0.0032614916563034058, "learning_rate": 4.538888888888889e-06, "loss": 0.0002, "num_tokens": 558886.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1831 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.5, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 8.5, "completions/mean_terminated_length": 8.5, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 33.925925925925924, "frac_reward_zero_std": 1.0, "grad_norm": 0.09663110226392746, "kl": 0.006796660367399454, "learning_rate": 4.538333333333334e-06, "loss": 0.0003, "num_tokens": 559140.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1832 }, { "clip_ratio/high_max": 0.005102040711790323, "clip_ratio/high_mean": 0.005102040711790323, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005102040711790323, "completion_length": 35.5, "completions/clipped_ratio": 0.0, "completions/max_length": 57.0, "completions/max_terminated_length": 57.0, "completions/mean_length": 35.5, "completions/mean_terminated_length": 35.5, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 33.94444444444444, "frac_reward_zero_std": 0.0, "grad_norm": 3.620103597640991, "kl": 0.03955706022679806, "learning_rate": 4.537777777777778e-06, "loss": 0.081, "num_tokens": 559506.0, "reward": 5.875, "reward_std": 3.25, "rewards/reward_combined/mean": 5.875, "rewards/reward_combined/std": 3.25, "step": 1833 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.5, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 33.96296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.15695130825042725, "kl": 0.03114278637804091, "learning_rate": 4.537222222222223e-06, "loss": 0.0017, "num_tokens": 559828.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1834 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 39.25, "completions/clipped_ratio": 0.0, "completions/max_length": 82.0, "completions/max_terminated_length": 82.0, "completions/mean_length": 39.25, "completions/mean_terminated_length": 39.25, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 33.98148148148148, "frac_reward_zero_std": 0.0, "grad_norm": 3.073749303817749, "kl": 0.07257397100329399, "learning_rate": 4.536666666666667e-06, "loss": 0.2495, "num_tokens": 560221.0, "reward": 2.174999952316284, "reward_std": 2.6500000953674316, "rewards/reward_combined/mean": 2.174999952316284, "rewards/reward_combined/std": 2.6500000953674316, "step": 1835 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.25, "completions/clipped_ratio": 0.0, "completions/max_length": 28.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 21.25, "completions/mean_terminated_length": 21.25, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 34.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.46851447224617004, "kl": 0.07999692484736443, "learning_rate": 4.536111111111112e-06, "loss": 0.0044, "num_tokens": 560558.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1836 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.5, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 12.5, "completions/mean_terminated_length": 12.5, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 34.01851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.5341672897338867, "kl": 0.09393883077427745, "learning_rate": 4.535555555555555e-06, "loss": 0.0034, "num_tokens": 560832.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1837 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 18.0, "completions/mean_terminated_length": 18.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 34.03703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.12074432522058487, "kl": 0.007861480233259499, "learning_rate": 4.535000000000001e-06, "loss": 0.0004, "num_tokens": 561144.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1838 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 5.5, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 5.5, "completions/mean_terminated_length": 5.5, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 34.05555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.00634710444137454, "kl": 0.008050882956013083, "learning_rate": 4.534444444444445e-06, "loss": 0.0004, "num_tokens": 561366.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 1839 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 34.074074074074076, "frac_reward_zero_std": 1.0, "grad_norm": 0.010675113648176193, "kl": 0.0007605142891407013, "learning_rate": 4.533888888888889e-06, "loss": 0.0, "num_tokens": 561610.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 1840 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0017301038606092334, "clip_ratio/low_min": 0.0017301038606092334, "clip_ratio/region_mean": 0.0017301038606092334, "completion_length": 85.75, "completions/clipped_ratio": 0.25, "completions/max_length": 256.0, "completions/max_terminated_length": 33.0, "completions/mean_length": 85.75, "completions/mean_terminated_length": 29.0, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 34.092592592592595, "frac_reward_zero_std": 0.0, "grad_norm": 1.574607491493225, "kl": 0.05892890691757202, "learning_rate": 4.533333333333334e-06, "loss": 0.3877, "num_tokens": 562177.0, "reward": 6.300000190734863, "reward_std": 2.3999998569488525, "rewards/reward_combined/mean": 6.300000190734863, "rewards/reward_combined/std": 2.3999998569488525, "step": 1841 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 26.0, "completions/clipped_ratio": 0.0, "completions/max_length": 36.0, "completions/max_terminated_length": 36.0, "completions/mean_length": 26.0, "completions/mean_terminated_length": 26.0, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 34.111111111111114, "frac_reward_zero_std": 0.0, "grad_norm": 2.3582303524017334, "kl": 0.03910455573350191, "learning_rate": 4.532777777777778e-06, "loss": 0.1234, "num_tokens": 562525.0, "reward": 7.875, "reward_std": 0.25, "rewards/reward_combined/mean": 7.875, "rewards/reward_combined/std": 0.25, "step": 1842 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 34.129629629629626, "frac_reward_zero_std": 0.0, "grad_norm": 1.7970280647277832, "kl": 0.03027614066377282, "learning_rate": 4.532222222222223e-06, "loss": 0.002, "num_tokens": 562813.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 1843 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 34.148148148148145, "frac_reward_zero_std": 1.0, "grad_norm": 0.22564388811588287, "kl": 0.04481789190322161, "learning_rate": 4.531666666666667e-06, "loss": 0.0023, "num_tokens": 563105.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1844 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.25, "completions/clipped_ratio": 0.0, "completions/max_length": 25.0, "completions/max_terminated_length": 25.0, "completions/mean_length": 23.25, "completions/mean_terminated_length": 23.25, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 34.166666666666664, "frac_reward_zero_std": 1.0, "grad_norm": 0.095942422747612, "kl": 0.023734424263238907, "learning_rate": 4.531111111111112e-06, "loss": 0.0012, "num_tokens": 563426.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1845 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 34.18518518518518, "frac_reward_zero_std": 1.0, "grad_norm": 0.10050618648529053, "kl": 0.02539545949548483, "learning_rate": 4.5305555555555555e-06, "loss": 0.0013, "num_tokens": 563714.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1846 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 7.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 7.0, "completions/mean_terminated_length": 7.0, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 34.2037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.014623438939452171, "kl": 0.012029822450131178, "learning_rate": 4.530000000000001e-06, "loss": 0.0006, "num_tokens": 563950.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 1847 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.5, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 9.5, "completions/mean_terminated_length": 9.5, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 34.22222222222222, "frac_reward_zero_std": 1.0, "grad_norm": 0.06707719713449478, "kl": 0.004107304732315242, "learning_rate": 4.529444444444445e-06, "loss": 0.0002, "num_tokens": 564208.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1848 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 34.24074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.008427065797150135, "kl": 0.0010208338499069214, "learning_rate": 4.528888888888889e-06, "loss": 0.0001, "num_tokens": 564420.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 1849 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 29.5, "completions/clipped_ratio": 0.0, "completions/max_length": 32.0, "completions/max_terminated_length": 32.0, "completions/mean_length": 29.5, "completions/mean_terminated_length": 29.5, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 34.25925925925926, "frac_reward_zero_std": 0.0, "grad_norm": 3.291226387023926, "kl": 0.07962330058217049, "learning_rate": 4.528333333333334e-06, "loss": 0.0294, "num_tokens": 564774.0, "reward": 3.375, "reward_std": 0.25, "rewards/reward_combined/mean": 3.375, "rewards/reward_combined/std": 0.25, "step": 1850 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 34.27777777777778, "frac_reward_zero_std": 1.0, "grad_norm": 0.030168039724230766, "kl": 0.0004325956106185913, "learning_rate": 4.527777777777778e-06, "loss": 0.0, "num_tokens": 564978.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 1851 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 34.2962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.03785755857825279, "kl": 0.0012319423258304596, "learning_rate": 4.5272222222222225e-06, "loss": 0.0001, "num_tokens": 565238.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 1852 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 34.31481481481482, "frac_reward_zero_std": 1.0, "grad_norm": 0.027749942615628242, "kl": 0.026929042302072048, "learning_rate": 4.526666666666667e-06, "loss": 0.0013, "num_tokens": 565506.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1853 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 34.333333333333336, "frac_reward_zero_std": 1.0, "grad_norm": 0.04531565681099892, "kl": 0.20883135497570038, "learning_rate": 4.526111111111112e-06, "loss": 0.0104, "num_tokens": 565810.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1854 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.75, "completions/clipped_ratio": 0.0, "completions/max_length": 34.0, "completions/max_terminated_length": 34.0, "completions/mean_length": 15.75, "completions/mean_terminated_length": 15.75, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 34.351851851851855, "frac_reward_zero_std": 0.0, "grad_norm": 7.61016321182251, "kl": 0.02989797480404377, "learning_rate": 4.5255555555555555e-06, "loss": 0.3003, "num_tokens": 566097.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 1855 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.5, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 20.5, "completions/mean_terminated_length": 20.5, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 34.370370370370374, "frac_reward_zero_std": 1.0, "grad_norm": 0.06532023102045059, "kl": 0.022275344468653202, "learning_rate": 4.525000000000001e-06, "loss": 0.0011, "num_tokens": 566431.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1856 }, { "clip_ratio/high_max": 0.012500000186264515, "clip_ratio/high_mean": 0.012500000186264515, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.012500000186264515, "completion_length": 78.75, "completions/clipped_ratio": 0.25, "completions/max_length": 256.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 78.75, "completions/mean_terminated_length": 19.666667938232422, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 34.388888888888886, "frac_reward_zero_std": 0.0, "grad_norm": 2.1519689559936523, "kl": 0.038375724107027054, "learning_rate": 4.524444444444444e-06, "loss": 0.4331, "num_tokens": 566970.0, "reward": 5.375, "reward_std": 4.25, "rewards/reward_combined/mean": 5.375, "rewards/reward_combined/std": 4.25, "step": 1857 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.75, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 10.75, "completions/mean_terminated_length": 10.75, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 34.407407407407405, "frac_reward_zero_std": 1.0, "grad_norm": 0.0590076819062233, "kl": 0.0070655690506100655, "learning_rate": 4.5238888888888894e-06, "loss": 0.0004, "num_tokens": 567241.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1858 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 7.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 7.0, "completions/mean_terminated_length": 7.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 34.425925925925924, "frac_reward_zero_std": 1.0, "grad_norm": 0.04941357299685478, "kl": 0.0014652087702415884, "learning_rate": 4.523333333333334e-06, "loss": 0.0, "num_tokens": 567489.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 1859 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 34.44444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.00983151700347662, "kl": 0.00024252831644844264, "learning_rate": 4.522777777777778e-06, "loss": 0.0, "num_tokens": 567745.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 1860 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.5, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 12.5, "completions/mean_terminated_length": 12.5, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 34.46296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.08553393185138702, "kl": 0.006692105132970028, "learning_rate": 4.5222222222222225e-06, "loss": 0.0004, "num_tokens": 568019.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1861 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 29.75, "completions/clipped_ratio": 0.0, "completions/max_length": 31.0, "completions/max_terminated_length": 31.0, "completions/mean_length": 29.75, "completions/mean_terminated_length": 29.75, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 34.48148148148148, "frac_reward_zero_std": 0.0, "grad_norm": 3.141714096069336, "kl": 0.23021288216114044, "learning_rate": 4.521666666666667e-06, "loss": 0.0352, "num_tokens": 568366.0, "reward": 5.25, "reward_std": 2.598076105117798, "rewards/reward_combined/mean": 5.25, "rewards/reward_combined/std": 2.598076105117798, "step": 1862 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 34.5, "frac_reward_zero_std": 1.0, "grad_norm": 0.02555246092379093, "kl": 0.011066973209381104, "learning_rate": 4.521111111111112e-06, "loss": 0.0006, "num_tokens": 568678.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 1863 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.25, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 20.25, "completions/mean_terminated_length": 20.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 34.51851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.10285855084657669, "kl": 0.10053723305463791, "learning_rate": 4.5205555555555556e-06, "loss": 0.005, "num_tokens": 568987.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1864 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 34.53703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.011728818528354168, "kl": 0.02021072618663311, "learning_rate": 4.520000000000001e-06, "loss": 0.001, "num_tokens": 569281.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1865 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.25, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 12.25, "completions/mean_terminated_length": 12.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 34.55555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.046598002314567566, "kl": 0.09429870545864105, "learning_rate": 4.519444444444444e-06, "loss": 0.0046, "num_tokens": 569546.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1866 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 18.0, "completions/mean_terminated_length": 18.0, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 34.574074074074076, "frac_reward_zero_std": 1.0, "grad_norm": 0.16966448724269867, "kl": 0.12387192249298096, "learning_rate": 4.5188888888888895e-06, "loss": 0.0057, "num_tokens": 569870.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1867 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 34.592592592592595, "frac_reward_zero_std": 1.0, "grad_norm": 0.018956856802105904, "kl": 0.007135905558243394, "learning_rate": 4.518333333333334e-06, "loss": 0.0004, "num_tokens": 570144.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1868 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 34.611111111111114, "frac_reward_zero_std": 1.0, "grad_norm": 0.0017267748480662704, "kl": 0.0183931365609169, "learning_rate": 4.517777777777778e-06, "loss": 0.0009, "num_tokens": 570404.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1869 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 26.5, "completions/clipped_ratio": 0.0, "completions/max_length": 49.0, "completions/max_terminated_length": 49.0, "completions/mean_length": 26.5, "completions/mean_terminated_length": 26.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 34.629629629629626, "frac_reward_zero_std": 1.0, "grad_norm": 0.11602456867694855, "kl": 0.08870114525780082, "learning_rate": 4.5172222222222225e-06, "loss": 0.0044, "num_tokens": 570730.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1870 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.5, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 17.5, "completions/mean_terminated_length": 17.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 34.648148148148145, "frac_reward_zero_std": 1.0, "grad_norm": 0.1464630365371704, "kl": 0.17373961955308914, "learning_rate": 4.516666666666667e-06, "loss": 0.0087, "num_tokens": 571040.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1871 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 34.666666666666664, "frac_reward_zero_std": 1.0, "grad_norm": 0.040346626192331314, "kl": 0.0324883759021759, "learning_rate": 4.516111111111111e-06, "loss": 0.0016, "num_tokens": 571256.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 1872 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 34.68518518518518, "frac_reward_zero_std": 1.0, "grad_norm": 0.0002647245128173381, "kl": 1.0311603546142578e-05, "learning_rate": 4.515555555555556e-06, "loss": 0.0, "num_tokens": 571476.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 1873 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 45.5, "completions/clipped_ratio": 0.0, "completions/max_length": 53.0, "completions/max_terminated_length": 53.0, "completions/mean_length": 45.5, "completions/mean_terminated_length": 45.5, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 34.7037037037037, "frac_reward_zero_std": 0.0, "grad_norm": 1.134329080581665, "kl": 0.0727376714348793, "learning_rate": 4.515000000000001e-06, "loss": 0.005, "num_tokens": 571938.0, "reward": 2.799999952316284, "reward_std": 0.4000000059604645, "rewards/reward_combined/mean": 2.799999952316284, "rewards/reward_combined/std": 0.4000000059604645, "step": 1874 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 34.72222222222222, "frac_reward_zero_std": 1.0, "grad_norm": 0.0013776085106655955, "kl": 0.01121530681848526, "learning_rate": 4.514444444444444e-06, "loss": 0.0006, "num_tokens": 572174.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 1875 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 34.74074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.0819246843457222, "kl": 0.012644737958908081, "learning_rate": 4.5138888888888895e-06, "loss": 0.0006, "num_tokens": 572470.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1876 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 34.75925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.09794357419013977, "kl": 0.009563634637743235, "learning_rate": 4.513333333333333e-06, "loss": 0.0005, "num_tokens": 572752.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1877 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.5, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 19.5, "completions/mean_terminated_length": 19.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 34.77777777777778, "frac_reward_zero_std": 0.0, "grad_norm": 2.6375772953033447, "kl": 0.0855606459081173, "learning_rate": 4.512777777777778e-06, "loss": -0.0673, "num_tokens": 573078.0, "reward": 4.625, "reward_std": 3.4731109142303467, "rewards/reward_combined/mean": 4.625, "rewards/reward_combined/std": 3.4731109142303467, "step": 1878 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.5, "completions/clipped_ratio": 0.0, "completions/max_length": 102.0, "completions/max_terminated_length": 102.0, "completions/mean_length": 32.5, "completions/mean_terminated_length": 32.5, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 34.7962962962963, "frac_reward_zero_std": 0.0, "grad_norm": 6.180176734924316, "kl": 0.17759111896157265, "learning_rate": 4.512222222222223e-06, "loss": -0.0537, "num_tokens": 573428.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 1879 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 34.81481481481482, "frac_reward_zero_std": 1.0, "grad_norm": 0.03578009083867073, "kl": 0.001086108386516571, "learning_rate": 4.511666666666667e-06, "loss": 0.0001, "num_tokens": 573640.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 1880 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.0, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 19.0, "completions/mean_terminated_length": 19.0, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 34.833333333333336, "frac_reward_zero_std": 1.0, "grad_norm": 0.0706334263086319, "kl": 0.019052274525165558, "learning_rate": 4.511111111111111e-06, "loss": 0.001, "num_tokens": 573944.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1881 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 17.0, "completions/mean_terminated_length": 17.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 34.851851851851855, "frac_reward_zero_std": 1.0, "grad_norm": 0.02748480625450611, "kl": 0.009285209700465202, "learning_rate": 4.510555555555556e-06, "loss": 0.0005, "num_tokens": 574252.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1882 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.75, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 19.75, "completions/mean_terminated_length": 19.75, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 34.870370370370374, "frac_reward_zero_std": 1.0, "grad_norm": 1.205411434173584, "kl": 0.16246045753359795, "learning_rate": 4.510000000000001e-06, "loss": 0.0088, "num_tokens": 574579.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 1883 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 34.888888888888886, "frac_reward_zero_std": 1.0, "grad_norm": 0.03379775211215019, "kl": 0.002025329857133329, "learning_rate": 4.509444444444444e-06, "loss": 0.0001, "num_tokens": 574847.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1884 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 34.907407407407405, "frac_reward_zero_std": 1.0, "grad_norm": 0.013741854578256607, "kl": 0.002825303701683879, "learning_rate": 4.50888888888889e-06, "loss": 0.0001, "num_tokens": 575131.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1885 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.0, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 19.0, "completions/mean_terminated_length": 19.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 34.925925925925924, "frac_reward_zero_std": 1.0, "grad_norm": 0.6520989537239075, "kl": 0.07164754346013069, "learning_rate": 4.508333333333333e-06, "loss": 0.0036, "num_tokens": 575427.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1886 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 34.25, "completions/clipped_ratio": 0.0, "completions/max_length": 35.0, "completions/max_terminated_length": 35.0, "completions/mean_length": 34.25, "completions/mean_terminated_length": 34.25, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 34.94444444444444, "frac_reward_zero_std": 0.0, "grad_norm": 2.9775516986846924, "kl": 0.0335043091326952, "learning_rate": 4.507777777777778e-06, "loss": -0.0026, "num_tokens": 575780.0, "reward": 2.25, "reward_std": 1.443375587463379, "rewards/reward_combined/mean": 2.25, "rewards/reward_combined/std": 1.4433757066726685, "step": 1887 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.0, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 24.0, "completions/mean_terminated_length": 24.0, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 34.96296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.0045790523290634155, "kl": 0.09023147821426392, "learning_rate": 4.507222222222223e-06, "loss": 0.0045, "num_tokens": 576144.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 1888 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 34.98148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.011647321283817291, "kl": 0.0014174373354762793, "learning_rate": 4.506666666666667e-06, "loss": 0.0001, "num_tokens": 576462.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1889 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.010638297535479069, "clip_ratio/low_min": 0.010638297535479069, "clip_ratio/region_mean": 0.010638297535479069, "completion_length": 22.0, "completions/clipped_ratio": 0.0, "completions/max_length": 25.0, "completions/max_terminated_length": 25.0, "completions/mean_length": 22.0, "completions/mean_terminated_length": 22.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 35.0, "frac_reward_zero_std": 0.0, "grad_norm": 4.132030010223389, "kl": 0.14002996310591698, "learning_rate": 4.506111111111111e-06, "loss": 0.0148, "num_tokens": 576770.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 1890 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.75, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.75, "completions/mean_terminated_length": 14.75, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 35.01851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.010695564560592175, "kl": 0.0014280645991675556, "learning_rate": 4.505555555555556e-06, "loss": 0.0001, "num_tokens": 577089.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1891 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.013513513840734959, "clip_ratio/low_min": 0.013513513840734959, "clip_ratio/region_mean": 0.013513513840734959, "completion_length": 15.25, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 15.25, "completions/mean_terminated_length": 15.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 35.03703703703704, "frac_reward_zero_std": 0.0, "grad_norm": 5.930971622467041, "kl": 0.0874680383130908, "learning_rate": 4.505e-06, "loss": 0.2046, "num_tokens": 577402.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 1892 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 35.05555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.025891441851854324, "kl": 0.003957332577556372, "learning_rate": 4.504444444444444e-06, "loss": 0.0002, "num_tokens": 577688.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1893 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.5, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 9.5, "completions/mean_terminated_length": 9.5, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 35.074074074074076, "frac_reward_zero_std": 1.0, "grad_norm": 0.04485754296183586, "kl": 0.004490172723308206, "learning_rate": 4.50388888888889e-06, "loss": 0.0002, "num_tokens": 577946.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1894 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 35.092592592592595, "frac_reward_zero_std": 1.0, "grad_norm": 0.057008713483810425, "kl": 0.00614563561975956, "learning_rate": 4.503333333333333e-06, "loss": 0.0003, "num_tokens": 578246.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1895 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.75, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.75, "completions/mean_terminated_length": 6.75, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 35.111111111111114, "frac_reward_zero_std": 0.0, "grad_norm": 5.040363311767578, "kl": 0.36898268945515156, "learning_rate": 4.502777777777778e-06, "loss": 0.0487, "num_tokens": 578481.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 1896 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.5, "completions/clipped_ratio": 0.0, "completions/max_length": 25.0, "completions/max_terminated_length": 25.0, "completions/mean_length": 16.5, "completions/mean_terminated_length": 16.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 35.129629629629626, "frac_reward_zero_std": 0.0, "grad_norm": 6.161988735198975, "kl": 0.06015567108988762, "learning_rate": 4.502222222222223e-06, "loss": 0.1476, "num_tokens": 578763.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 1897 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.25, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.25, "completions/mean_terminated_length": 14.25, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 35.148148148148145, "frac_reward_zero_std": 1.0, "grad_norm": 0.06465952098369598, "kl": 0.009086916921660304, "learning_rate": 4.501666666666667e-06, "loss": 0.0005, "num_tokens": 579052.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1898 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 35.166666666666664, "frac_reward_zero_std": 1.0, "grad_norm": 0.016987796872854233, "kl": 0.0008008331060409546, "learning_rate": 4.501111111111111e-06, "loss": 0.0, "num_tokens": 579312.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 1899 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 35.18518518518518, "frac_reward_zero_std": 1.0, "grad_norm": 0.06390701234340668, "kl": 0.008168598171323538, "learning_rate": 4.500555555555556e-06, "loss": 0.0004, "num_tokens": 579572.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1900 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.75, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 12.75, "completions/mean_terminated_length": 12.75, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 35.2037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.020316727459430695, "kl": 0.0008193471876438707, "learning_rate": 4.5e-06, "loss": 0.0, "num_tokens": 579835.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1901 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.5, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 12.5, "completions/mean_terminated_length": 12.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 35.22222222222222, "frac_reward_zero_std": 1.0, "grad_norm": 0.07206843793392181, "kl": 0.007231655530631542, "learning_rate": 4.4994444444444445e-06, "loss": 0.0004, "num_tokens": 580109.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1902 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 35.24074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.09461364895105362, "kl": 0.011085071135312319, "learning_rate": 4.49888888888889e-06, "loss": 0.0005, "num_tokens": 580385.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1903 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 35.25925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.07826594263315201, "kl": 0.0026755332946777344, "learning_rate": 4.498333333333333e-06, "loss": 0.0001, "num_tokens": 580605.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 1904 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 35.27777777777778, "frac_reward_zero_std": 1.0, "grad_norm": 0.06632345914840698, "kl": 0.002940857782959938, "learning_rate": 4.497777777777778e-06, "loss": 0.0001, "num_tokens": 580873.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1905 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 35.2962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.001267787883989513, "kl": 0.011253692209720612, "learning_rate": 4.497222222222223e-06, "loss": 0.0006, "num_tokens": 581109.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 1906 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.75, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 19.75, "completions/mean_terminated_length": 19.75, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 35.31481481481482, "frac_reward_zero_std": 1.0, "grad_norm": 0.08794524520635605, "kl": 0.01356555218808353, "learning_rate": 4.496666666666667e-06, "loss": 0.0006, "num_tokens": 581432.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1907 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.5, "completions/clipped_ratio": 0.0, "completions/max_length": 34.0, "completions/max_terminated_length": 34.0, "completions/mean_length": 23.5, "completions/mean_terminated_length": 23.5, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 35.333333333333336, "frac_reward_zero_std": 0.0, "grad_norm": 7.0690460205078125, "kl": 0.08899215515702963, "learning_rate": 4.4961111111111115e-06, "loss": 0.1377, "num_tokens": 581778.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 1908 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 38.0, "completions/max_terminated_length": 38.0, "completions/mean_length": 18.0, "completions/mean_terminated_length": 18.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 35.351851851851855, "frac_reward_zero_std": 0.0, "grad_norm": 7.597713947296143, "kl": 0.03623668011277914, "learning_rate": 4.495555555555556e-06, "loss": 0.0126, "num_tokens": 582062.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 1909 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 43.0, "completions/clipped_ratio": 0.0, "completions/max_length": 43.0, "completions/max_terminated_length": 43.0, "completions/mean_length": 43.0, "completions/mean_terminated_length": 43.0, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 35.370370370370374, "frac_reward_zero_std": 1.0, "grad_norm": 0.0052820416167378426, "kl": 0.06209264136850834, "learning_rate": 4.495e-06, "loss": 0.0031, "num_tokens": 582514.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 1910 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 35.388888888888886, "frac_reward_zero_std": 1.0, "grad_norm": 0.0062948307022452354, "kl": 0.008099866099655628, "learning_rate": 4.4944444444444445e-06, "loss": 0.0004, "num_tokens": 582826.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 1911 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 5.5, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 5.5, "completions/mean_terminated_length": 5.5, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 35.407407407407405, "frac_reward_zero_std": 1.0, "grad_norm": 0.020914960652589798, "kl": 0.007740733562968671, "learning_rate": 4.49388888888889e-06, "loss": 0.0004, "num_tokens": 583048.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 1912 }, { "clip_ratio/high_max": 0.010416666977107525, "clip_ratio/high_mean": 0.010416666977107525, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010416666977107525, "completion_length": 22.75, "completions/clipped_ratio": 0.0, "completions/max_length": 29.0, "completions/max_terminated_length": 29.0, "completions/mean_length": 22.75, "completions/mean_terminated_length": 22.75, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 35.425925925925924, "frac_reward_zero_std": 0.0, "grad_norm": 2.4939208030700684, "kl": 0.11887087672948837, "learning_rate": 4.493333333333333e-06, "loss": -0.1, "num_tokens": 583367.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 1913 }, { "clip_ratio/high_max": 0.016949152573943138, "clip_ratio/high_mean": 0.016949152573943138, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.016949152573943138, "completion_length": 33.5, "completions/clipped_ratio": 0.0, "completions/max_length": 39.0, "completions/max_terminated_length": 39.0, "completions/mean_length": 33.5, "completions/mean_terminated_length": 33.5, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 35.44444444444444, "frac_reward_zero_std": 0.0, "grad_norm": 1.7933999300003052, "kl": 0.1358058601617813, "learning_rate": 4.4927777777777784e-06, "loss": -0.0151, "num_tokens": 583725.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 1914 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.5, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 2.5, "completions/mean_terminated_length": 2.5, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 35.46296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.0838281512260437, "kl": 0.0034071356058120728, "learning_rate": 4.492222222222223e-06, "loss": 0.0002, "num_tokens": 583931.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 1915 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.0, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 19.0, "completions/mean_terminated_length": 19.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 35.48148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.09553811699151993, "kl": 0.006266951095312834, "learning_rate": 4.491666666666667e-06, "loss": 0.0003, "num_tokens": 584247.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1916 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 49.0, "completions/clipped_ratio": 0.0, "completions/max_length": 62.0, "completions/max_terminated_length": 62.0, "completions/mean_length": 49.0, "completions/mean_terminated_length": 49.0, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "epoch": 35.5, "frac_reward_zero_std": 0.0, "grad_norm": 4.3728790283203125, "kl": 0.0687653198838234, "learning_rate": 4.4911111111111115e-06, "loss": 0.0933, "num_tokens": 584659.0, "reward": 2.875, "reward_std": 1.25, "rewards/reward_combined/mean": 2.875, "rewards/reward_combined/std": 1.25, "step": 1917 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.5, "completions/clipped_ratio": 0.0, "completions/max_length": 30.0, "completions/max_terminated_length": 30.0, "completions/mean_length": 24.5, "completions/mean_terminated_length": 24.5, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 35.51851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.06606964766979218, "kl": 0.014580248389393091, "learning_rate": 4.490555555555556e-06, "loss": 0.0007, "num_tokens": 584985.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1918 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 35.53703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.008691217750310898, "kl": 0.001905098557472229, "learning_rate": 4.49e-06, "loss": 0.0001, "num_tokens": 585197.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 1919 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 35.55555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.0021677077747881413, "kl": 0.00013440847396850586, "learning_rate": 4.4894444444444446e-06, "loss": 0.0, "num_tokens": 585453.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 1920 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.25, "completions/clipped_ratio": 0.0, "completions/max_length": 30.0, "completions/max_terminated_length": 30.0, "completions/mean_length": 25.25, "completions/mean_terminated_length": 25.25, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 35.574074074074076, "frac_reward_zero_std": 0.0, "grad_norm": 3.6774778366088867, "kl": 0.21074063330888748, "learning_rate": 4.488888888888889e-06, "loss": 0.059, "num_tokens": 585802.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 1921 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.5, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 19.5, "completions/mean_terminated_length": 19.5, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 35.592592592592595, "frac_reward_zero_std": 1.0, "grad_norm": 0.15146446228027344, "kl": 0.02675052545964718, "learning_rate": 4.488333333333333e-06, "loss": 0.0013, "num_tokens": 586108.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1922 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 28.75, "completions/clipped_ratio": 0.0, "completions/max_length": 32.0, "completions/max_terminated_length": 32.0, "completions/mean_length": 28.75, "completions/mean_terminated_length": 28.75, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 35.611111111111114, "frac_reward_zero_std": 0.0, "grad_norm": 2.8107898235321045, "kl": 0.1742212101817131, "learning_rate": 4.4877777777777785e-06, "loss": -0.0007, "num_tokens": 586459.0, "reward": 2.75, "reward_std": 1.1902379989624023, "rewards/reward_combined/mean": 2.75, "rewards/reward_combined/std": 1.190238118171692, "step": 1923 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.0, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 24.0, "completions/mean_terminated_length": 24.0, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 35.629629629629626, "frac_reward_zero_std": 1.0, "grad_norm": 0.004698503762483597, "kl": 0.09019893407821655, "learning_rate": 4.487222222222223e-06, "loss": 0.0045, "num_tokens": 586823.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 1924 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 35.648148148148145, "frac_reward_zero_std": 1.0, "grad_norm": 0.03768645226955414, "kl": 0.0047996582579799, "learning_rate": 4.486666666666667e-06, "loss": 0.0002, "num_tokens": 587083.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 1925 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 35.666666666666664, "frac_reward_zero_std": 1.0, "grad_norm": 0.01762758381664753, "kl": 0.0007303506135940552, "learning_rate": 4.4861111111111115e-06, "loss": 0.0, "num_tokens": 587295.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 1926 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.0, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 21.0, "completions/mean_terminated_length": 21.0, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 35.68518518518518, "frac_reward_zero_std": 1.0, "grad_norm": 0.0356055349111557, "kl": 0.05251556262373924, "learning_rate": 4.485555555555556e-06, "loss": 0.0026, "num_tokens": 587627.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1927 }, { "clip_ratio/high_max": 0.009433962404727936, "clip_ratio/high_mean": 0.009433962404727936, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.009433962404727936, "completion_length": 26.75, "completions/clipped_ratio": 0.0, "completions/max_length": 31.0, "completions/max_terminated_length": 31.0, "completions/mean_length": 26.75, "completions/mean_terminated_length": 26.75, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 35.7037037037037, "frac_reward_zero_std": 0.0, "grad_norm": 4.850531578063965, "kl": 0.09347373992204666, "learning_rate": 4.485e-06, "loss": 0.0814, "num_tokens": 587954.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 1928 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.75, "completions/clipped_ratio": 0.0, "completions/max_length": 42.0, "completions/max_terminated_length": 42.0, "completions/mean_length": 24.75, "completions/mean_terminated_length": 24.75, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 35.72222222222222, "frac_reward_zero_std": 0.0, "grad_norm": 3.2058262825012207, "kl": 0.03270742669701576, "learning_rate": 4.484444444444445e-06, "loss": -0.1885, "num_tokens": 588305.0, "reward": 7.625, "reward_std": 0.25, "rewards/reward_combined/mean": 7.625, "rewards/reward_combined/std": 0.25, "step": 1929 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.5, "completions/clipped_ratio": 0.0, "completions/max_length": 43.0, "completions/max_terminated_length": 43.0, "completions/mean_length": 24.5, "completions/mean_terminated_length": 24.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 35.74074074074074, "frac_reward_zero_std": 0.0, "grad_norm": 3.891814708709717, "kl": 0.30814019218087196, "learning_rate": 4.483888888888889e-06, "loss": 0.2122, "num_tokens": 588623.0, "reward": 4.25, "reward_std": 2.1794495582580566, "rewards/reward_combined/mean": 4.25, "rewards/reward_combined/std": 2.1794495582580566, "step": 1930 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.0, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 19.0, "completions/mean_terminated_length": 19.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 35.75925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.08725087344646454, "kl": 0.0435319896787405, "learning_rate": 4.483333333333333e-06, "loss": 0.0022, "num_tokens": 588919.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1931 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 26.5, "completions/clipped_ratio": 0.0, "completions/max_length": 48.0, "completions/max_terminated_length": 48.0, "completions/mean_length": 26.5, "completions/mean_terminated_length": 26.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 35.77777777777778, "frac_reward_zero_std": 0.0, "grad_norm": 4.73525857925415, "kl": 0.03620629012584686, "learning_rate": 4.4827777777777785e-06, "loss": 0.2393, "num_tokens": 589245.0, "reward": 5.25, "reward_std": 4.5, "rewards/reward_combined/mean": 5.25, "rewards/reward_combined/std": 4.5, "step": 1932 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 35.7962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.17594671249389648, "kl": 0.04390465281903744, "learning_rate": 4.482222222222223e-06, "loss": 0.0022, "num_tokens": 589513.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1933 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 35.81481481481482, "frac_reward_zero_std": 1.0, "grad_norm": 0.048359595239162445, "kl": 0.009993799962103367, "learning_rate": 4.481666666666667e-06, "loss": 0.0005, "num_tokens": 589785.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1934 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.5, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 17.5, "completions/mean_terminated_length": 17.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 35.833333333333336, "frac_reward_zero_std": 1.0, "grad_norm": 0.06564460694789886, "kl": 0.16547507047653198, "learning_rate": 4.481111111111112e-06, "loss": 0.0083, "num_tokens": 590095.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1935 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.75, "completions/clipped_ratio": 0.0, "completions/max_length": 25.0, "completions/max_terminated_length": 25.0, "completions/mean_length": 21.75, "completions/mean_terminated_length": 21.75, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 35.851851851851855, "frac_reward_zero_std": 0.0, "grad_norm": 4.1623005867004395, "kl": 0.0593174546957016, "learning_rate": 4.480555555555556e-06, "loss": 0.093, "num_tokens": 590406.0, "reward": 4.875, "reward_std": 3.1983067989349365, "rewards/reward_combined/mean": 4.875, "rewards/reward_combined/std": 3.1983067989349365, "step": 1936 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 35.870370370370374, "frac_reward_zero_std": 1.0, "grad_norm": 0.007981108501553535, "kl": 0.020120983012020588, "learning_rate": 4.48e-06, "loss": 0.001, "num_tokens": 590700.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1937 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 35.888888888888886, "frac_reward_zero_std": 1.0, "grad_norm": 0.08903476595878601, "kl": 0.2223193272948265, "learning_rate": 4.479444444444445e-06, "loss": 0.0111, "num_tokens": 591002.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1938 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 35.907407407407405, "frac_reward_zero_std": 1.0, "grad_norm": 0.0009722707909531891, "kl": 0.01852957345545292, "learning_rate": 4.478888888888889e-06, "loss": 0.0009, "num_tokens": 591262.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1939 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.75, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.75, "completions/mean_terminated_length": 14.75, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 35.925925925925924, "frac_reward_zero_std": 1.0, "grad_norm": 0.11470168828964233, "kl": 0.03125123027712107, "learning_rate": 4.478333333333334e-06, "loss": 0.0016, "num_tokens": 591553.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1940 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.0, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 19.0, "completions/mean_terminated_length": 19.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 35.94444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.12193123251199722, "kl": 0.10795196145772934, "learning_rate": 4.477777777777778e-06, "loss": 0.0054, "num_tokens": 591857.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1941 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 35.96296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.07386869192123413, "kl": 0.012261404190212488, "learning_rate": 4.477222222222223e-06, "loss": 0.0006, "num_tokens": 592145.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1942 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.5, "completions/clipped_ratio": 0.0, "completions/max_length": 33.0, "completions/max_terminated_length": 33.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 35.98148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.1412280648946762, "kl": 0.045462075620889664, "learning_rate": 4.476666666666667e-06, "loss": 0.002, "num_tokens": 592427.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1943 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 36.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.018355702981352806, "kl": 0.03065376728773117, "learning_rate": 4.476111111111112e-06, "loss": 0.0015, "num_tokens": 592643.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 1944 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 36.01851851851852, "frac_reward_zero_std": 0.0, "grad_norm": 2.8208017349243164, "kl": 0.2091994732618332, "learning_rate": 4.475555555555556e-06, "loss": 0.0071, "num_tokens": 592947.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 1945 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 30.0, "completions/clipped_ratio": 0.0, "completions/max_length": 34.0, "completions/max_terminated_length": 34.0, "completions/mean_length": 30.0, "completions/mean_terminated_length": 30.0, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 36.03703703703704, "frac_reward_zero_std": 0.0, "grad_norm": 3.15264630317688, "kl": 0.07286170870065689, "learning_rate": 4.475e-06, "loss": 0.0361, "num_tokens": 593295.0, "reward": 4.75, "reward_std": 3.617089033126831, "rewards/reward_combined/mean": 4.75, "rewards/reward_combined/std": 3.61708927154541, "step": 1946 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 36.05555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.023903323337435722, "kl": 0.003138929605484009, "learning_rate": 4.474444444444445e-06, "loss": 0.0002, "num_tokens": 593503.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 1947 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 36.074074074074076, "frac_reward_zero_std": 1.0, "grad_norm": 0.37687617540359497, "kl": 0.016563307493925095, "learning_rate": 4.473888888888889e-06, "loss": 0.0012, "num_tokens": 593719.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 1948 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 36.092592592592595, "frac_reward_zero_std": 1.0, "grad_norm": 0.0010043643414974213, "kl": 0.01852165348827839, "learning_rate": 4.473333333333334e-06, "loss": 0.0009, "num_tokens": 593979.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1949 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 31.25, "completions/clipped_ratio": 0.0, "completions/max_length": 33.0, "completions/max_terminated_length": 33.0, "completions/mean_length": 31.25, "completions/mean_terminated_length": 31.25, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 36.111111111111114, "frac_reward_zero_std": 1.0, "grad_norm": 0.23537446558475494, "kl": 0.1194453313946724, "learning_rate": 4.472777777777778e-06, "loss": 0.0059, "num_tokens": 594340.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 1950 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 36.129629629629626, "frac_reward_zero_std": 1.0, "grad_norm": 0.007630498148500919, "kl": 0.002547873998992145, "learning_rate": 4.472222222222223e-06, "loss": 0.0001, "num_tokens": 594624.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1951 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.25, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.25, "completions/mean_terminated_length": 14.25, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 36.148148148148145, "frac_reward_zero_std": 1.0, "grad_norm": 0.04168723151087761, "kl": 0.02332253661006689, "learning_rate": 4.471666666666667e-06, "loss": 0.0013, "num_tokens": 594913.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1952 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 36.166666666666664, "frac_reward_zero_std": 1.0, "grad_norm": 0.07385408133268356, "kl": 0.033542269840836525, "learning_rate": 4.471111111111112e-06, "loss": 0.0017, "num_tokens": 595181.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1953 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.25, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 11.25, "completions/mean_terminated_length": 11.25, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 36.18518518518518, "frac_reward_zero_std": 1.0, "grad_norm": 0.5020606517791748, "kl": 0.04608247522264719, "learning_rate": 4.470555555555556e-06, "loss": 0.0027, "num_tokens": 595446.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1954 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 5.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 5.0, "completions/mean_terminated_length": 5.0, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 36.2037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.005351410713046789, "kl": 0.0014217197895050049, "learning_rate": 4.47e-06, "loss": 0.0001, "num_tokens": 595666.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 1955 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.0, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 20.0, "completions/mean_terminated_length": 20.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 36.22222222222222, "frac_reward_zero_std": 1.0, "grad_norm": 0.08577816188335419, "kl": 0.01677424064837396, "learning_rate": 4.469444444444445e-06, "loss": 0.0008, "num_tokens": 595990.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1956 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 27.5, "completions/clipped_ratio": 0.0, "completions/max_length": 30.0, "completions/max_terminated_length": 30.0, "completions/mean_length": 27.5, "completions/mean_terminated_length": 27.5, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 36.24074074074074, "frac_reward_zero_std": 0.0, "grad_norm": 6.1210832595825195, "kl": 0.08660689368844032, "learning_rate": 4.468888888888889e-06, "loss": 0.0517, "num_tokens": 596348.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 1957 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.5, "completions/clipped_ratio": 0.0, "completions/max_length": 29.0, "completions/max_terminated_length": 29.0, "completions/mean_length": 24.5, "completions/mean_terminated_length": 24.5, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 36.25925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.17870907485485077, "kl": 0.037875059992074966, "learning_rate": 4.468333333333334e-06, "loss": 0.0019, "num_tokens": 596674.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1958 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 36.27777777777778, "frac_reward_zero_std": 0.0, "grad_norm": 3.791686773300171, "kl": 0.10894937813282013, "learning_rate": 4.467777777777778e-06, "loss": 0.0049, "num_tokens": 596938.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 1959 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 36.2962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.006980588659644127, "kl": 0.00048070604680106044, "learning_rate": 4.467222222222223e-06, "loss": 0.0, "num_tokens": 597204.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1960 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.5, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.5, "completions/mean_terminated_length": 6.5, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 36.31481481481482, "frac_reward_zero_std": 1.0, "grad_norm": 0.035292722284793854, "kl": 0.0023712898837402463, "learning_rate": 4.4666666666666665e-06, "loss": 0.0001, "num_tokens": 597438.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 1961 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 36.333333333333336, "frac_reward_zero_std": 1.0, "grad_norm": 0.02056741714477539, "kl": 0.028837352991104126, "learning_rate": 4.466111111111112e-06, "loss": 0.0014, "num_tokens": 597654.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 1962 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 36.351851851851855, "frac_reward_zero_std": 1.0, "grad_norm": 0.06852855533361435, "kl": 0.00877176783978939, "learning_rate": 4.465555555555556e-06, "loss": 0.0004, "num_tokens": 597952.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1963 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 36.370370370370374, "frac_reward_zero_std": 1.0, "grad_norm": 0.0058617740869522095, "kl": 0.0001225396990776062, "learning_rate": 4.4650000000000004e-06, "loss": 0.0, "num_tokens": 598164.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 1964 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.75, "completions/clipped_ratio": 0.0, "completions/max_length": 27.0, "completions/max_terminated_length": 27.0, "completions/mean_length": 12.75, "completions/mean_terminated_length": 12.75, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 36.388888888888886, "frac_reward_zero_std": 1.0, "grad_norm": 0.09185619652271271, "kl": 0.01079381350427866, "learning_rate": 4.464444444444445e-06, "loss": 0.0008, "num_tokens": 598427.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 1965 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.0, "completions/max_length": 33.0, "completions/max_terminated_length": 33.0, "completions/mean_length": 32.0, "completions/mean_terminated_length": 32.0, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 36.407407407407405, "frac_reward_zero_std": 0.0, "grad_norm": 3.2083051204681396, "kl": 0.06450754217803478, "learning_rate": 4.463888888888889e-06, "loss": 0.0181, "num_tokens": 598779.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 1966 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 36.425925925925924, "frac_reward_zero_std": 1.0, "grad_norm": 1.6278712791972794e-05, "kl": 4.000961780548096e-06, "learning_rate": 4.463333333333334e-06, "loss": 0.0, "num_tokens": 598999.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 1967 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.25, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 13.25, "completions/mean_terminated_length": 13.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 36.44444444444444, "frac_reward_zero_std": 0.0, "grad_norm": 5.620673656463623, "kl": 0.07756593450903893, "learning_rate": 4.462777777777778e-06, "loss": 0.1383, "num_tokens": 599280.0, "reward": 4.5, "reward_std": 4.041451930999756, "rewards/reward_combined/mean": 4.5, "rewards/reward_combined/std": 4.041451930999756, "step": 1968 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.0, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 20.0, "completions/mean_terminated_length": 20.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 36.46296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.14624810218811035, "kl": 0.07022104412317276, "learning_rate": 4.462222222222223e-06, "loss": 0.0034, "num_tokens": 599608.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1969 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.0, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 24.0, "completions/mean_terminated_length": 24.0, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 36.48148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.004372062161564827, "kl": 0.09019581973552704, "learning_rate": 4.461666666666667e-06, "loss": 0.0045, "num_tokens": 599972.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 1970 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 36.5, "frac_reward_zero_std": 1.0, "grad_norm": 0.05967795103788376, "kl": 0.009160356014035642, "learning_rate": 4.461111111111112e-06, "loss": 0.0005, "num_tokens": 600248.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1971 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.0, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 17.0, "completions/mean_terminated_length": 17.0, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 36.51851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.4540921449661255, "kl": 0.048416554229334, "learning_rate": 4.460555555555556e-06, "loss": 0.0023, "num_tokens": 600576.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1972 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 36.53703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.07966338098049164, "kl": 0.01251540333032608, "learning_rate": 4.4600000000000005e-06, "loss": 0.0006, "num_tokens": 600836.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 1973 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.5, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 19.5, "completions/mean_terminated_length": 19.5, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 36.55555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.04618893936276436, "kl": 0.007898394949734211, "learning_rate": 4.459444444444445e-06, "loss": 0.0004, "num_tokens": 601142.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1974 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 36.574074074074076, "frac_reward_zero_std": 1.0, "grad_norm": 0.005562911741435528, "kl": 0.0012161374324932694, "learning_rate": 4.458888888888889e-06, "loss": 0.0001, "num_tokens": 601402.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 1975 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.5, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 10.5, "completions/mean_terminated_length": 10.5, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 36.592592592592595, "frac_reward_zero_std": 1.0, "grad_norm": 0.09940361976623535, "kl": 0.008270694874227047, "learning_rate": 4.4583333333333336e-06, "loss": 0.0004, "num_tokens": 601668.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1976 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 36.611111111111114, "frac_reward_zero_std": 1.0, "grad_norm": 0.06252198666334152, "kl": 0.006913790479302406, "learning_rate": 4.457777777777778e-06, "loss": 0.0003, "num_tokens": 601956.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1977 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 36.629629629629626, "frac_reward_zero_std": 1.0, "grad_norm": 0.006098712794482708, "kl": 0.008249427191913128, "learning_rate": 4.457222222222223e-06, "loss": 0.0004, "num_tokens": 602268.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 1978 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 34.25, "completions/clipped_ratio": 0.0, "completions/max_length": 41.0, "completions/max_terminated_length": 41.0, "completions/mean_length": 34.25, "completions/mean_terminated_length": 34.25, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 36.648148148148145, "frac_reward_zero_std": 0.0, "grad_norm": 3.461517572402954, "kl": 0.14498266018927097, "learning_rate": 4.456666666666667e-06, "loss": -0.0575, "num_tokens": 602621.0, "reward": 2.875, "reward_std": 1.25, "rewards/reward_combined/mean": 2.875, "rewards/reward_combined/std": 1.25, "step": 1979 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 26.0, "completions/clipped_ratio": 0.0, "completions/max_length": 32.0, "completions/max_terminated_length": 32.0, "completions/mean_length": 26.0, "completions/mean_terminated_length": 26.0, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 36.666666666666664, "frac_reward_zero_std": 1.0, "grad_norm": 0.18718072772026062, "kl": 0.07160401530563831, "learning_rate": 4.456111111111112e-06, "loss": 0.0035, "num_tokens": 602945.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 1980 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.5, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 17.5, "completions/mean_terminated_length": 17.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 36.68518518518518, "frac_reward_zero_std": 1.0, "grad_norm": 0.02434297651052475, "kl": 0.1277841366827488, "learning_rate": 4.455555555555555e-06, "loss": 0.0064, "num_tokens": 603255.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1981 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 36.7037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.07553129643201828, "kl": 0.013828820083290339, "learning_rate": 4.4550000000000005e-06, "loss": 0.0007, "num_tokens": 603553.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1982 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.012820512987673283, "clip_ratio/low_min": 0.012820512987673283, "clip_ratio/region_mean": 0.012820512987673283, "completion_length": 19.25, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 19.25, "completions/mean_terminated_length": 19.25, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 36.72222222222222, "frac_reward_zero_std": 0.0, "grad_norm": 4.770021915435791, "kl": 0.051053768023848534, "learning_rate": 4.454444444444445e-06, "loss": 0.012, "num_tokens": 603882.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 1983 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 36.74074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.03304231911897659, "kl": 0.0007398699526675045, "learning_rate": 4.453888888888889e-06, "loss": 0.0, "num_tokens": 604150.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1984 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 33.75, "completions/clipped_ratio": 0.0, "completions/max_length": 66.0, "completions/max_terminated_length": 66.0, "completions/mean_length": 33.75, "completions/mean_terminated_length": 33.75, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 36.75925925925926, "frac_reward_zero_std": 0.0, "grad_norm": 3.6511738300323486, "kl": 0.08937395364046097, "learning_rate": 4.453333333333334e-06, "loss": 0.3, "num_tokens": 604505.0, "reward": 3.5, "reward_std": 4.690415859222412, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 4.690415859222412, "step": 1985 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 36.77777777777778, "frac_reward_zero_std": 1.0, "grad_norm": 1.519407868385315, "kl": 0.33459360525012016, "learning_rate": 4.452777777777778e-06, "loss": 0.0144, "num_tokens": 604795.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1986 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.75, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.75, "completions/mean_terminated_length": 13.75, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 36.7962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.03875732421875, "kl": 0.002911806936026551, "learning_rate": 4.452222222222223e-06, "loss": 0.0001, "num_tokens": 605074.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1987 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.75, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 18.75, "completions/mean_terminated_length": 18.75, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 36.81481481481482, "frac_reward_zero_std": 1.0, "grad_norm": 0.08745433390140533, "kl": 0.014163524378091097, "learning_rate": 4.451666666666667e-06, "loss": 0.0007, "num_tokens": 605369.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 1988 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.5, "completions/clipped_ratio": 0.0, "completions/max_length": 31.0, "completions/max_terminated_length": 31.0, "completions/mean_length": 18.5, "completions/mean_terminated_length": 18.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 36.833333333333336, "frac_reward_zero_std": 0.0, "grad_norm": 4.227729320526123, "kl": 0.13154592365026474, "learning_rate": 4.451111111111112e-06, "loss": -0.0135, "num_tokens": 605663.0, "reward": 6.5, "reward_std": 2.345207929611206, "rewards/reward_combined/mean": 6.5, "rewards/reward_combined/std": 2.345207929611206, "step": 1989 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 36.851851851851855, "frac_reward_zero_std": 1.0, "grad_norm": 0.001505248830653727, "kl": 0.011179938912391663, "learning_rate": 4.450555555555555e-06, "loss": 0.0006, "num_tokens": 605899.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 1990 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.25, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 21.25, "completions/mean_terminated_length": 21.25, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 36.870370370370374, "frac_reward_zero_std": 0.0, "grad_norm": 3.488132953643799, "kl": 0.08701146021485329, "learning_rate": 4.450000000000001e-06, "loss": 0.0369, "num_tokens": 606212.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 1991 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.5, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 20.5, "completions/mean_terminated_length": 20.5, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 36.888888888888886, "frac_reward_zero_std": 1.0, "grad_norm": 0.08379175513982773, "kl": 0.023225650191307068, "learning_rate": 4.449444444444445e-06, "loss": 0.0012, "num_tokens": 606546.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1992 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 71.5, "completions/clipped_ratio": 0.25, "completions/max_length": 256.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 71.5, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 36.907407407407405, "frac_reward_zero_std": 0.0, "grad_norm": 2.124232769012451, "kl": 0.02370841335505247, "learning_rate": 4.448888888888889e-06, "loss": 0.4631, "num_tokens": 607052.0, "reward": 6.625, "reward_std": 1.75, "rewards/reward_combined/mean": 6.625, "rewards/reward_combined/std": 1.75, "step": 1993 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 36.925925925925924, "frac_reward_zero_std": 1.0, "grad_norm": 0.04027624800801277, "kl": 0.008440040051937103, "learning_rate": 4.448333333333334e-06, "loss": 0.0004, "num_tokens": 607332.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1994 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.5, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 18.5, "completions/mean_terminated_length": 18.5, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 36.94444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.02771504409611225, "kl": 0.001874844660051167, "learning_rate": 4.447777777777778e-06, "loss": 0.0001, "num_tokens": 607646.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 1995 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 41.5, "completions/clipped_ratio": 0.0, "completions/max_length": 43.0, "completions/max_terminated_length": 43.0, "completions/mean_length": 41.5, "completions/mean_terminated_length": 41.5, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 36.96296296296296, "frac_reward_zero_std": 0.0, "grad_norm": 1.7017443180084229, "kl": 0.4568101726472378, "learning_rate": 4.447222222222222e-06, "loss": -0.0153, "num_tokens": 608092.0, "reward": 2.375, "reward_std": 1.25, "rewards/reward_combined/mean": 2.375, "rewards/reward_combined/std": 1.25, "step": 1996 }, { "clip_ratio/high_max": 0.0030674845911562443, "clip_ratio/high_mean": 0.0030674845911562443, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0030674845911562443, "completion_length": 51.75, "completions/clipped_ratio": 0.0, "completions/max_length": 143.0, "completions/max_terminated_length": 143.0, "completions/mean_length": 51.75, "completions/mean_terminated_length": 51.75, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 36.98148148148148, "frac_reward_zero_std": 0.0, "grad_norm": 2.6528329849243164, "kl": 0.040492214262485504, "learning_rate": 4.446666666666667e-06, "loss": -0.4078, "num_tokens": 608523.0, "reward": 7.75, "reward_std": 0.28867512941360474, "rewards/reward_combined/mean": 7.75, "rewards/reward_combined/std": 0.28867512941360474, "step": 1997 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 37.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.02664884924888611, "kl": 0.0004827946540899575, "learning_rate": 4.446111111111112e-06, "loss": 0.0, "num_tokens": 608779.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 1998 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 38.25, "completions/clipped_ratio": 0.0, "completions/max_length": 53.0, "completions/max_terminated_length": 53.0, "completions/mean_length": 38.25, "completions/mean_terminated_length": 38.25, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 37.01851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.08247847855091095, "kl": 0.0468553826212883, "learning_rate": 4.4455555555555554e-06, "loss": 0.0023, "num_tokens": 609148.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 1999 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 12.5, "completions/mean_terminated_length": 12.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 37.03703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.05346888676285744, "kl": 0.005455116042867303, "learning_rate": 4.445000000000001e-06, "loss": 0.0003, "num_tokens": 609422.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2000 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.25, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 10.25, "completions/mean_terminated_length": 10.25, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 37.05555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.08068736642599106, "kl": 0.006376167526468635, "learning_rate": 4.444444444444444e-06, "loss": 0.0003, "num_tokens": 609683.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2001 }, { "clip_ratio/high_max": 0.010869565419852734, "clip_ratio/high_mean": 0.010869565419852734, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010869565419852734, "completion_length": 26.75, "completions/clipped_ratio": 0.0, "completions/max_length": 41.0, "completions/max_terminated_length": 41.0, "completions/mean_length": 26.75, "completions/mean_terminated_length": 26.75, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 37.074074074074076, "frac_reward_zero_std": 0.0, "grad_norm": 4.611499309539795, "kl": 0.08839858323335648, "learning_rate": 4.443888888888889e-06, "loss": 0.179, "num_tokens": 610014.0, "reward": 5.875, "reward_std": 3.25, "rewards/reward_combined/mean": 5.875, "rewards/reward_combined/std": 3.25, "step": 2002 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 37.092592592592595, "frac_reward_zero_std": 1.0, "grad_norm": 0.012458153069019318, "kl": 0.0014928989112377167, "learning_rate": 4.443333333333334e-06, "loss": 0.0001, "num_tokens": 610274.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 2003 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 50.0, "completions/clipped_ratio": 0.0, "completions/max_length": 53.0, "completions/max_terminated_length": 53.0, "completions/mean_length": 50.0, "completions/mean_terminated_length": 50.0, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 37.111111111111114, "frac_reward_zero_std": 1.0, "grad_norm": 0.07985608279705048, "kl": 0.07067974656820297, "learning_rate": 4.442777777777778e-06, "loss": 0.0036, "num_tokens": 610754.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 2004 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 37.129629629629626, "frac_reward_zero_std": 1.0, "grad_norm": 0.001011303742416203, "kl": 5.425512608780991e-05, "learning_rate": 4.442222222222222e-06, "loss": 0.0, "num_tokens": 611026.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2005 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 37.148148148148145, "frac_reward_zero_std": 1.0, "grad_norm": 0.2282446175813675, "kl": 0.013594793621450663, "learning_rate": 4.441666666666667e-06, "loss": 0.0007, "num_tokens": 611298.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2006 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 37.166666666666664, "frac_reward_zero_std": 1.0, "grad_norm": 0.08701394498348236, "kl": 0.005737423896789551, "learning_rate": 4.441111111111112e-06, "loss": 0.0003, "num_tokens": 611510.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 2007 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.75, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 12.75, "completions/mean_terminated_length": 12.75, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 37.18518518518518, "frac_reward_zero_std": 1.0, "grad_norm": 0.059742528945207596, "kl": 0.002412238740362227, "learning_rate": 4.4405555555555555e-06, "loss": 0.0001, "num_tokens": 611773.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2008 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 37.2037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.040785375982522964, "kl": 0.007690551690757275, "learning_rate": 4.440000000000001e-06, "loss": 0.0004, "num_tokens": 612057.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2009 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 37.22222222222222, "frac_reward_zero_std": 1.0, "grad_norm": 0.002117918571457267, "kl": 0.0024721783120185137, "learning_rate": 4.439444444444444e-06, "loss": 0.0001, "num_tokens": 612341.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2010 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 37.24074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.037244763225317, "kl": 0.006303668953478336, "learning_rate": 4.438888888888889e-06, "loss": 0.0003, "num_tokens": 612643.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2011 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.25, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 22.25, "completions/mean_terminated_length": 22.25, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 37.25925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.07834801822900772, "kl": 0.013212065678089857, "learning_rate": 4.438333333333334e-06, "loss": 0.0007, "num_tokens": 612960.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2012 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 18.0, "completions/mean_terminated_length": 18.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 37.27777777777778, "frac_reward_zero_std": 1.0, "grad_norm": 0.009845548309385777, "kl": 0.0018860234413295984, "learning_rate": 4.437777777777778e-06, "loss": 0.0001, "num_tokens": 613272.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2013 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 37.2962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.007154408376663923, "kl": 0.0013248592149466276, "learning_rate": 4.4372222222222225e-06, "loss": 0.0001, "num_tokens": 613532.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 2014 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.5, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 16.5, "completions/mean_terminated_length": 16.5, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 37.31481481481482, "frac_reward_zero_std": 1.0, "grad_norm": 0.08333168923854828, "kl": 0.019226177595555782, "learning_rate": 4.436666666666667e-06, "loss": 0.001, "num_tokens": 613846.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 2015 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 37.333333333333336, "frac_reward_zero_std": 1.0, "grad_norm": 0.0974864736199379, "kl": 0.23616840690374374, "learning_rate": 4.436111111111111e-06, "loss": 0.0117, "num_tokens": 614148.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2016 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 37.351851851851855, "frac_reward_zero_std": 1.0, "grad_norm": 0.02791820652782917, "kl": 0.0019713714718818665, "learning_rate": 4.4355555555555555e-06, "loss": 0.0001, "num_tokens": 614356.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 2017 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.75, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 13.75, "completions/mean_terminated_length": 13.75, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 37.370370370370374, "frac_reward_zero_std": 0.0, "grad_norm": 3.3818936347961426, "kl": 0.02285052416846156, "learning_rate": 4.435000000000001e-06, "loss": -0.0588, "num_tokens": 614643.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 2018 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.0, "completions/clipped_ratio": 0.0, "completions/max_length": 37.0, "completions/max_terminated_length": 37.0, "completions/mean_length": 21.0, "completions/mean_terminated_length": 21.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 37.388888888888886, "frac_reward_zero_std": 0.0, "grad_norm": 3.439682722091675, "kl": 0.05032962281256914, "learning_rate": 4.434444444444444e-06, "loss": 0.2697, "num_tokens": 614947.0, "reward": 4.875, "reward_std": 3.1983067989349365, "rewards/reward_combined/mean": 4.875, "rewards/reward_combined/std": 3.1983067989349365, "step": 2019 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 37.407407407407405, "frac_reward_zero_std": 1.0, "grad_norm": 0.0001436203601770103, "kl": 8.262693881988525e-06, "learning_rate": 4.4338888888888894e-06, "loss": 0.0, "num_tokens": 615167.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 2020 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.5, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 16.5, "completions/mean_terminated_length": 16.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 37.425925925925924, "frac_reward_zero_std": 1.0, "grad_norm": 0.030241578817367554, "kl": 0.005909159546718001, "learning_rate": 4.433333333333334e-06, "loss": 0.0003, "num_tokens": 615461.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2021 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 6.0, "completions/mean_terminated_length": 6.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 37.44444444444444, "frac_reward_zero_std": 0.0, "grad_norm": 11.732074737548828, "kl": 0.0038502190727740526, "learning_rate": 4.432777777777778e-06, "loss": 0.2377, "num_tokens": 615681.0, "reward": 2.75, "reward_std": 1.5, "rewards/reward_combined/mean": 2.75, "rewards/reward_combined/std": 1.5, "step": 2022 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.25, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 13.25, "completions/mean_terminated_length": 13.25, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 37.46296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.4847821891307831, "kl": 0.09767242148518562, "learning_rate": 4.4322222222222225e-06, "loss": 0.0047, "num_tokens": 615954.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2023 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 5.25, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 5.25, "completions/mean_terminated_length": 5.25, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 37.48148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.023622209206223488, "kl": 0.018268076702952385, "learning_rate": 4.431666666666667e-06, "loss": 0.0009, "num_tokens": 616175.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 2024 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 37.5, "frac_reward_zero_std": 1.0, "grad_norm": 0.0017216292908415198, "kl": 0.01835672650486231, "learning_rate": 4.431111111111111e-06, "loss": 0.0009, "num_tokens": 616435.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2025 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.25, "completions/clipped_ratio": 0.0, "completions/max_length": 38.0, "completions/max_terminated_length": 38.0, "completions/mean_length": 19.25, "completions/mean_terminated_length": 19.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 37.51851851851852, "frac_reward_zero_std": 0.0, "grad_norm": 2.7610645294189453, "kl": 0.005284200189635158, "learning_rate": 4.430555555555556e-06, "loss": 0.2345, "num_tokens": 616736.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 2026 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.75, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 18.75, "completions/mean_terminated_length": 18.75, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 37.53703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.11876385658979416, "kl": 0.021717418916523457, "learning_rate": 4.430000000000001e-06, "loss": 0.0011, "num_tokens": 617031.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2027 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.75, "completions/clipped_ratio": 0.0, "completions/max_length": 28.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 20.75, "completions/mean_terminated_length": 20.75, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 37.55555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.026050830259919167, "kl": 0.01819790992885828, "learning_rate": 4.429444444444444e-06, "loss": 0.0009, "num_tokens": 617366.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2028 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.5, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 19.5, "completions/mean_terminated_length": 19.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 37.574074074074076, "frac_reward_zero_std": 1.0, "grad_norm": 0.1107461228966713, "kl": 0.09556860104203224, "learning_rate": 4.4288888888888895e-06, "loss": 0.0049, "num_tokens": 617664.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2029 }, { "clip_ratio/high_max": 0.012820512987673283, "clip_ratio/high_mean": 0.012820512987673283, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.012820512987673283, "completion_length": 23.0, "completions/clipped_ratio": 0.0, "completions/max_length": 29.0, "completions/max_terminated_length": 29.0, "completions/mean_length": 23.0, "completions/mean_terminated_length": 23.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 37.592592592592595, "frac_reward_zero_std": 0.0, "grad_norm": 3.003286361694336, "kl": 0.08356158062815666, "learning_rate": 4.428333333333334e-06, "loss": -0.0441, "num_tokens": 617976.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 2030 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 7.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 7.0, "completions/mean_terminated_length": 7.0, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 37.611111111111114, "frac_reward_zero_std": 1.0, "grad_norm": 0.028956301510334015, "kl": 0.003346409182995558, "learning_rate": 4.427777777777778e-06, "loss": 0.0002, "num_tokens": 618212.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 2031 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 28.25, "completions/clipped_ratio": 0.0, "completions/max_length": 33.0, "completions/max_terminated_length": 33.0, "completions/mean_length": 28.25, "completions/mean_terminated_length": 28.25, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 37.629629629629626, "frac_reward_zero_std": 0.0, "grad_norm": 2.873485803604126, "kl": 0.09995235502719879, "learning_rate": 4.4272222222222226e-06, "loss": 0.027, "num_tokens": 618553.0, "reward": 4.625, "reward_std": 3.4731109142303467, "rewards/reward_combined/mean": 4.625, "rewards/reward_combined/std": 3.4731109142303467, "step": 2032 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.5, "completions/clipped_ratio": 0.0, "completions/max_length": 28.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 24.5, "completions/mean_terminated_length": 24.5, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 37.648148148148145, "frac_reward_zero_std": 0.0, "grad_norm": 6.260163307189941, "kl": 0.1673351339995861, "learning_rate": 4.426666666666667e-06, "loss": -0.0176, "num_tokens": 618899.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 2033 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.5, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 19.5, "completions/mean_terminated_length": 19.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 37.666666666666664, "frac_reward_zero_std": 0.0, "grad_norm": 4.691178321838379, "kl": 0.14619768410921097, "learning_rate": 4.426111111111111e-06, "loss": 0.0967, "num_tokens": 619217.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 2034 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.5, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 20.5, "completions/mean_terminated_length": 20.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 37.68518518518518, "frac_reward_zero_std": 0.0, "grad_norm": 4.622272968292236, "kl": 0.10412169620394707, "learning_rate": 4.425555555555556e-06, "loss": 0.0411, "num_tokens": 619527.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 2035 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.5, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 20.5, "completions/mean_terminated_length": 20.5, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 37.7037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.08443807065486908, "kl": 0.021859318017959595, "learning_rate": 4.425e-06, "loss": 0.0011, "num_tokens": 619861.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2036 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 27.5, "completions/clipped_ratio": 0.0, "completions/max_length": 35.0, "completions/max_terminated_length": 35.0, "completions/mean_length": 27.5, "completions/mean_terminated_length": 27.5, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 37.72222222222222, "frac_reward_zero_std": 0.0, "grad_norm": 3.9471542835235596, "kl": 0.11890756711363792, "learning_rate": 4.424444444444444e-06, "loss": -0.0821, "num_tokens": 620195.0, "reward": 4.625, "reward_std": 3.4731109142303467, "rewards/reward_combined/mean": 4.625, "rewards/reward_combined/std": 3.4731109142303467, "step": 2037 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 37.74074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.026802150532603264, "kl": 0.011750641278922558, "learning_rate": 4.4238888888888895e-06, "loss": 0.0006, "num_tokens": 620491.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2038 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 37.75925925925926, "frac_reward_zero_std": 0.0, "grad_norm": 6.783059597015381, "kl": 0.14122093468904495, "learning_rate": 4.423333333333334e-06, "loss": 0.1314, "num_tokens": 620767.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 2039 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.25, "completions/clipped_ratio": 0.0, "completions/max_length": 25.0, "completions/max_terminated_length": 25.0, "completions/mean_length": 18.25, "completions/mean_terminated_length": 18.25, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 37.77777777777778, "frac_reward_zero_std": 1.0, "grad_norm": 0.059608008712530136, "kl": 0.005075703840702772, "learning_rate": 4.422777777777778e-06, "loss": 0.0003, "num_tokens": 621100.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2040 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 37.7962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.016223814338445663, "kl": 0.0004413557326188311, "learning_rate": 4.422222222222223e-06, "loss": 0.0, "num_tokens": 621378.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2041 }, { "clip_ratio/high_max": 0.010416666977107525, "clip_ratio/high_mean": 0.010416666977107525, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010416666977107525, "completion_length": 26.5, "completions/clipped_ratio": 0.0, "completions/max_length": 30.0, "completions/max_terminated_length": 30.0, "completions/mean_length": 26.5, "completions/mean_terminated_length": 26.5, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 37.81481481481482, "frac_reward_zero_std": 0.0, "grad_norm": 3.215750217437744, "kl": 0.06362145766615868, "learning_rate": 4.421666666666667e-06, "loss": 0.0203, "num_tokens": 621720.0, "reward": 2.75, "reward_std": 1.5, "rewards/reward_combined/mean": 2.75, "rewards/reward_combined/std": 1.5, "step": 2042 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 37.833333333333336, "frac_reward_zero_std": 1.0, "grad_norm": 0.012375343590974808, "kl": 0.0018283352255821228, "learning_rate": 4.421111111111111e-06, "loss": 0.0001, "num_tokens": 621964.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 2043 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 37.851851851851855, "frac_reward_zero_std": 1.0, "grad_norm": 0.011838962323963642, "kl": 0.033215299248695374, "learning_rate": 4.420555555555556e-06, "loss": 0.0017, "num_tokens": 622232.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2044 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 37.870370370370374, "frac_reward_zero_std": 1.0, "grad_norm": 0.0394209586083889, "kl": 0.046022411435842514, "learning_rate": 4.42e-06, "loss": 0.0023, "num_tokens": 622526.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2045 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.0, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 24.0, "completions/mean_terminated_length": 24.0, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 37.888888888888886, "frac_reward_zero_std": 1.0, "grad_norm": 0.004685268737375736, "kl": 0.09016633033752441, "learning_rate": 4.419444444444444e-06, "loss": 0.0045, "num_tokens": 622890.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 2046 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.0, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 20.0, "completions/mean_terminated_length": 20.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 37.907407407407405, "frac_reward_zero_std": 1.0, "grad_norm": 0.1707593947649002, "kl": 0.06575945764780045, "learning_rate": 4.41888888888889e-06, "loss": 0.0032, "num_tokens": 623218.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2047 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 5.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 5.0, "completions/mean_terminated_length": 5.0, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 37.925925925925924, "frac_reward_zero_std": 1.0, "grad_norm": 0.006325181107968092, "kl": 0.001667749893385917, "learning_rate": 4.418333333333334e-06, "loss": 0.0001, "num_tokens": 623438.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 2048 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.0, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 22.0, "completions/mean_terminated_length": 22.0, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 37.94444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.024446256458759308, "kl": 0.006525122094899416, "learning_rate": 4.417777777777778e-06, "loss": 0.0003, "num_tokens": 623770.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2049 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 37.96296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.0026897324714809656, "kl": 0.000232198835874442, "learning_rate": 4.417222222222223e-06, "loss": 0.0, "num_tokens": 624026.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 2050 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 37.98148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.0027322133537381887, "kl": 0.010990820825099945, "learning_rate": 4.416666666666667e-06, "loss": 0.0005, "num_tokens": 624262.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 2051 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 38.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.034206222742795944, "kl": 0.0073572953697294, "learning_rate": 4.416111111111111e-06, "loss": 0.0004, "num_tokens": 624550.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2052 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 36.75, "completions/clipped_ratio": 0.0, "completions/max_length": 45.0, "completions/max_terminated_length": 45.0, "completions/mean_length": 36.75, "completions/mean_terminated_length": 36.75, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 38.01851851851852, "frac_reward_zero_std": 0.0, "grad_norm": 2.796631336212158, "kl": 0.035745106637477875, "learning_rate": 4.415555555555556e-06, "loss": -0.1017, "num_tokens": 624913.0, "reward": 2.875, "reward_std": 1.25, "rewards/reward_combined/mean": 2.875, "rewards/reward_combined/std": 1.25, "step": 2053 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.25, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 20.25, "completions/mean_terminated_length": 20.25, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 38.03703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.3930623531341553, "kl": 0.05602386221289635, "learning_rate": 4.415e-06, "loss": 0.0029, "num_tokens": 625242.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 2054 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.0, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 22.0, "completions/mean_terminated_length": 22.0, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 38.05555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.041085582226514816, "kl": 0.0074393125250935555, "learning_rate": 4.4144444444444444e-06, "loss": 0.0004, "num_tokens": 625574.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2055 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 38.074074074074076, "frac_reward_zero_std": 1.0, "grad_norm": 0.025568222627043724, "kl": 0.005333932116627693, "learning_rate": 4.41388888888889e-06, "loss": 0.0003, "num_tokens": 625860.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2056 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.75, "completions/clipped_ratio": 0.0, "completions/max_length": 32.0, "completions/max_terminated_length": 32.0, "completions/mean_length": 25.75, "completions/mean_terminated_length": 25.75, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 38.092592592592595, "frac_reward_zero_std": 0.0, "grad_norm": 3.6102309226989746, "kl": 0.8366698175668716, "learning_rate": 4.413333333333334e-06, "loss": -0.0163, "num_tokens": 626199.0, "reward": 1.875, "reward_std": 2.9261748790740967, "rewards/reward_combined/mean": 1.875, "rewards/reward_combined/std": 2.9261748790740967, "step": 2057 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.75, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 4.75, "completions/mean_terminated_length": 4.75, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 38.111111111111114, "frac_reward_zero_std": 1.0, "grad_norm": 1.6299822330474854, "kl": 0.19112237391527742, "learning_rate": 4.412777777777778e-06, "loss": 0.0085, "num_tokens": 626418.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 2058 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.75, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 20.75, "completions/mean_terminated_length": 20.75, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 38.129629629629626, "frac_reward_zero_std": 1.0, "grad_norm": 0.20101316273212433, "kl": 0.1007685512304306, "learning_rate": 4.412222222222223e-06, "loss": 0.005, "num_tokens": 626721.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 2059 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 38.148148148148145, "frac_reward_zero_std": 1.0, "grad_norm": 0.056715719401836395, "kl": 0.03460953291505575, "learning_rate": 4.411666666666667e-06, "loss": 0.0018, "num_tokens": 627013.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2060 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.5, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.5, "completions/mean_terminated_length": 6.5, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 38.166666666666664, "frac_reward_zero_std": 1.0, "grad_norm": 0.031010687351226807, "kl": 0.0015866982867009938, "learning_rate": 4.411111111111111e-06, "loss": 0.0001, "num_tokens": 627247.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 2061 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.75, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 11.75, "completions/mean_terminated_length": 11.75, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 38.18518518518518, "frac_reward_zero_std": 1.0, "grad_norm": 1.6748607158660889, "kl": 0.19523620000109076, "learning_rate": 4.410555555555556e-06, "loss": 0.0121, "num_tokens": 627514.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2062 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 38.2037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.0036280457861721516, "kl": 0.002374062198214233, "learning_rate": 4.41e-06, "loss": 0.0001, "num_tokens": 627798.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2063 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 38.22222222222222, "frac_reward_zero_std": 1.0, "grad_norm": 0.0001677630643825978, "kl": 1.3433396816253662e-05, "learning_rate": 4.409444444444445e-06, "loss": 0.0, "num_tokens": 628018.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 2064 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 38.24074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.25331035256385803, "kl": 0.02228901907801628, "learning_rate": 4.408888888888889e-06, "loss": 0.0011, "num_tokens": 628284.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2065 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 38.25925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.054470326751470566, "kl": 0.0037299368414096534, "learning_rate": 4.408333333333334e-06, "loss": 0.0002, "num_tokens": 628554.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2066 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.0, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 21.0, "completions/mean_terminated_length": 21.0, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 38.27777777777778, "frac_reward_zero_std": 1.0, "grad_norm": 0.07952671498060226, "kl": 0.04304840788245201, "learning_rate": 4.407777777777778e-06, "loss": 0.0022, "num_tokens": 628886.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2067 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 38.2962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.017032664269208908, "kl": 0.00030316709307953715, "learning_rate": 4.407222222222223e-06, "loss": 0.0, "num_tokens": 629142.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 2068 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.25, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 19.25, "completions/mean_terminated_length": 19.25, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 38.31481481481482, "frac_reward_zero_std": 1.0, "grad_norm": 0.028431352227926254, "kl": 0.008348799776285887, "learning_rate": 4.406666666666667e-06, "loss": 0.0004, "num_tokens": 629447.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2069 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 38.333333333333336, "frac_reward_zero_std": 1.0, "grad_norm": 0.11938723921775818, "kl": 0.010894098784774542, "learning_rate": 4.4061111111111115e-06, "loss": 0.0005, "num_tokens": 629723.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2070 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.5, "completions/clipped_ratio": 0.0, "completions/max_length": 27.0, "completions/max_terminated_length": 27.0, "completions/mean_length": 22.5, "completions/mean_terminated_length": 22.5, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 38.351851851851855, "frac_reward_zero_std": 1.0, "grad_norm": 0.11039452999830246, "kl": 0.08745656907558441, "learning_rate": 4.405555555555556e-06, "loss": 0.0044, "num_tokens": 630041.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2071 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.5, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 12.5, "completions/mean_terminated_length": 12.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 38.370370370370374, "frac_reward_zero_std": 1.0, "grad_norm": 0.07577838748693466, "kl": 0.015780379995703697, "learning_rate": 4.405e-06, "loss": 0.0007, "num_tokens": 630315.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2072 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.5, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 8.5, "completions/mean_terminated_length": 8.5, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 38.388888888888886, "frac_reward_zero_std": 0.0, "grad_norm": 11.349297523498535, "kl": 0.0532870456809178, "learning_rate": 4.404444444444445e-06, "loss": -0.2092, "num_tokens": 630569.0, "reward": 2.75, "reward_std": 1.5, "rewards/reward_combined/mean": 2.75, "rewards/reward_combined/std": 1.5, "step": 2073 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 38.407407407407405, "frac_reward_zero_std": 1.0, "grad_norm": 0.04384300485253334, "kl": 0.0021740496158599854, "learning_rate": 4.403888888888889e-06, "loss": 0.0001, "num_tokens": 630781.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 2074 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.0, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 19.0, "completions/mean_terminated_length": 19.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 38.425925925925924, "frac_reward_zero_std": 1.0, "grad_norm": 0.1485247015953064, "kl": 0.015317912679165602, "learning_rate": 4.403333333333334e-06, "loss": 0.0008, "num_tokens": 631077.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2075 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 138.0, "completions/clipped_ratio": 0.5, "completions/max_length": 256.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 138.0, "completions/mean_terminated_length": 20.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 38.44444444444444, "frac_reward_zero_std": 0.0, "grad_norm": 1.160291314125061, "kl": 0.15550267696380615, "learning_rate": 4.4027777777777784e-06, "loss": 0.5694, "num_tokens": 631853.0, "reward": 0.7999999523162842, "reward_std": 4.809019565582275, "rewards/reward_combined/mean": 0.7999999523162842, "rewards/reward_combined/std": 4.809019565582275, "step": 2076 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.5, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 10.5, "completions/mean_terminated_length": 10.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 38.46296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.09107646346092224, "kl": 0.008230413310229778, "learning_rate": 4.402222222222223e-06, "loss": 0.0004, "num_tokens": 632123.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2077 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.5, "completions/clipped_ratio": 0.0, "completions/max_length": 35.0, "completions/max_terminated_length": 35.0, "completions/mean_length": 18.5, "completions/mean_terminated_length": 18.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 38.48148148148148, "frac_reward_zero_std": 0.0, "grad_norm": 4.3601531982421875, "kl": 0.022789071314036846, "learning_rate": 4.401666666666667e-06, "loss": 0.2342, "num_tokens": 632417.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 2078 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.5, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 19.5, "completions/mean_terminated_length": 19.5, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 38.5, "frac_reward_zero_std": 0.0, "grad_norm": 6.479309558868408, "kl": 0.04705419950187206, "learning_rate": 4.4011111111111115e-06, "loss": -0.0218, "num_tokens": 632747.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 2079 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 38.51851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.010891436599195004, "kl": 0.03345745801925659, "learning_rate": 4.400555555555556e-06, "loss": 0.0017, "num_tokens": 633015.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2080 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.75, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 17.75, "completions/mean_terminated_length": 17.75, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 38.53703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.14173443615436554, "kl": 0.0184175877366215, "learning_rate": 4.4e-06, "loss": 0.0009, "num_tokens": 633326.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2081 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.5, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 19.5, "completions/mean_terminated_length": 19.5, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 38.55555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.031929753720760345, "kl": 0.12125849723815918, "learning_rate": 4.3994444444444454e-06, "loss": 0.0061, "num_tokens": 633632.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2082 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 38.574074074074076, "frac_reward_zero_std": 1.0, "grad_norm": 0.07912478595972061, "kl": 0.019214711617678404, "learning_rate": 4.398888888888889e-06, "loss": 0.001, "num_tokens": 633924.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2083 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 38.592592592592595, "frac_reward_zero_std": 1.0, "grad_norm": 0.01665710285305977, "kl": 0.0019461624324321747, "learning_rate": 4.398333333333334e-06, "loss": 0.0001, "num_tokens": 634184.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 2084 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 7.75, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 7.75, "completions/mean_terminated_length": 7.75, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 38.611111111111114, "frac_reward_zero_std": 1.0, "grad_norm": 0.03019752912223339, "kl": 0.0021870146738365293, "learning_rate": 4.397777777777778e-06, "loss": 0.0001, "num_tokens": 634427.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 2085 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.75, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 11.75, "completions/mean_terminated_length": 11.75, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 38.629629629629626, "frac_reward_zero_std": 1.0, "grad_norm": 0.21005718410015106, "kl": 0.13446243107318878, "learning_rate": 4.397222222222223e-06, "loss": 0.0066, "num_tokens": 634690.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2086 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.5, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 8.5, "completions/mean_terminated_length": 8.5, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 38.648148148148145, "frac_reward_zero_std": 1.0, "grad_norm": 0.12801440060138702, "kl": 0.0115868104621768, "learning_rate": 4.396666666666667e-06, "loss": 0.0006, "num_tokens": 634944.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2087 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 43.25, "completions/clipped_ratio": 0.0, "completions/max_length": 59.0, "completions/max_terminated_length": 59.0, "completions/mean_length": 43.25, "completions/mean_terminated_length": 43.25, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 38.666666666666664, "frac_reward_zero_std": 0.0, "grad_norm": 4.188738822937012, "kl": 0.0990917906165123, "learning_rate": 4.3961111111111116e-06, "loss": 0.1623, "num_tokens": 635341.0, "reward": 5.25, "reward_std": 4.5, "rewards/reward_combined/mean": 5.25, "rewards/reward_combined/std": 4.5, "step": 2088 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 38.68518518518518, "frac_reward_zero_std": 1.0, "grad_norm": 0.12854702770709991, "kl": 0.027095475234091282, "learning_rate": 4.395555555555556e-06, "loss": 0.0014, "num_tokens": 635637.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2089 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 38.7037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.0015110017266124487, "kl": 0.01836683414876461, "learning_rate": 4.395e-06, "loss": 0.0009, "num_tokens": 635897.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2090 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 38.72222222222222, "frac_reward_zero_std": 1.0, "grad_norm": 0.08467625826597214, "kl": 0.004262618720531464, "learning_rate": 4.3944444444444455e-06, "loss": 0.0003, "num_tokens": 636105.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 2091 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.75, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.75, "completions/mean_terminated_length": 14.75, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 38.74074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.009488685056567192, "kl": 0.0010586967691779137, "learning_rate": 4.393888888888889e-06, "loss": 0.0001, "num_tokens": 636424.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2092 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 38.75925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.04486318677663803, "kl": 0.22794969379901886, "learning_rate": 4.393333333333334e-06, "loss": 0.0114, "num_tokens": 636726.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2093 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.25, "completions/clipped_ratio": 0.0, "completions/max_length": 31.0, "completions/max_terminated_length": 31.0, "completions/mean_length": 24.25, "completions/mean_terminated_length": 24.25, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 38.77777777777778, "frac_reward_zero_std": 0.0, "grad_norm": 3.5647599697113037, "kl": 0.060825543478131294, "learning_rate": 4.392777777777778e-06, "loss": 0.1146, "num_tokens": 637043.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 2094 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.25, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 20.25, "completions/mean_terminated_length": 20.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 38.7962962962963, "frac_reward_zero_std": 0.0, "grad_norm": 2.7676024436950684, "kl": 0.029563307762145996, "learning_rate": 4.392222222222223e-06, "loss": 0.0829, "num_tokens": 637376.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 2095 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 42.5, "completions/clipped_ratio": 0.0, "completions/max_length": 43.0, "completions/max_terminated_length": 43.0, "completions/mean_length": 42.5, "completions/mean_terminated_length": 42.5, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "epoch": 38.81481481481482, "frac_reward_zero_std": 1.0, "grad_norm": 0.027673054486513138, "kl": 0.041923897340893745, "learning_rate": 4.391666666666667e-06, "loss": 0.0021, "num_tokens": 637826.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 2096 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.0, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 24.0, "completions/mean_terminated_length": 24.0, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 38.833333333333336, "frac_reward_zero_std": 0.0, "grad_norm": 0.4528777301311493, "kl": 0.43711068481206894, "learning_rate": 4.391111111111112e-06, "loss": 0.0219, "num_tokens": 638190.0, "reward": 2.875, "reward_std": 1.25, "rewards/reward_combined/mean": 2.875, "rewards/reward_combined/std": 1.25, "step": 2097 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 38.851851851851855, "frac_reward_zero_std": 1.0, "grad_norm": 0.8188351392745972, "kl": 0.08224657364189625, "learning_rate": 4.390555555555556e-06, "loss": 0.0046, "num_tokens": 638490.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2098 }, { "clip_ratio/high_max": 0.01923076994717121, "clip_ratio/high_mean": 0.01923076994717121, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.01923076994717121, "completion_length": 8.5, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 8.5, "completions/mean_terminated_length": 8.5, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 38.870370370370374, "frac_reward_zero_std": 0.0, "grad_norm": 6.743462562561035, "kl": 0.034806590527296066, "learning_rate": 4.39e-06, "loss": 0.2343, "num_tokens": 638724.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 2099 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.5, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 18.5, "completions/mean_terminated_length": 18.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 38.888888888888886, "frac_reward_zero_std": 0.0, "grad_norm": 4.862011432647705, "kl": 0.19079793989658356, "learning_rate": 4.389444444444445e-06, "loss": 0.076, "num_tokens": 639038.0, "reward": 5.125, "reward_std": 3.4731109142303467, "rewards/reward_combined/mean": 5.125, "rewards/reward_combined/std": 3.4731109142303467, "step": 2100 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 38.907407407407405, "frac_reward_zero_std": 1.0, "grad_norm": 0.0033022514544427395, "kl": 0.010803289711475372, "learning_rate": 4.388888888888889e-06, "loss": 0.0005, "num_tokens": 639274.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 2101 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.25, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 12.25, "completions/mean_terminated_length": 12.25, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 38.925925925925924, "frac_reward_zero_std": 1.0, "grad_norm": 0.3887422978878021, "kl": 0.07053389132488519, "learning_rate": 4.388333333333334e-06, "loss": 0.0041, "num_tokens": 639547.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2102 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.0, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 22.0, "completions/mean_terminated_length": 22.0, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 38.94444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.07025422900915146, "kl": 0.011417937465012074, "learning_rate": 4.387777777777778e-06, "loss": 0.0006, "num_tokens": 639863.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2103 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 38.96296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.05428071692585945, "kl": 0.008804447948932648, "learning_rate": 4.387222222222223e-06, "loss": 0.0004, "num_tokens": 640075.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 2104 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 38.98148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.0647546723484993, "kl": 0.011865920387208462, "learning_rate": 4.3866666666666665e-06, "loss": 0.0006, "num_tokens": 640387.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 2105 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.25, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.25, "completions/mean_terminated_length": 14.25, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 39.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.04933720827102661, "kl": 0.008435897529125214, "learning_rate": 4.386111111111112e-06, "loss": 0.0004, "num_tokens": 640676.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2106 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.5, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 18.5, "completions/mean_terminated_length": 18.5, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 39.01851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.08785158395767212, "kl": 0.00794348819181323, "learning_rate": 4.385555555555556e-06, "loss": 0.0004, "num_tokens": 640990.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2107 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 26.75, "completions/clipped_ratio": 0.0, "completions/max_length": 30.0, "completions/max_terminated_length": 30.0, "completions/mean_length": 26.75, "completions/mean_terminated_length": 26.75, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 39.03703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.20421867072582245, "kl": 0.042918263003230095, "learning_rate": 4.385e-06, "loss": 0.0022, "num_tokens": 641345.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 2108 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.25, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 18.25, "completions/mean_terminated_length": 18.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 39.05555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.05645808205008507, "kl": 0.060418942011892796, "learning_rate": 4.384444444444445e-06, "loss": 0.0031, "num_tokens": 641638.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2109 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.75, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.75, "completions/mean_terminated_length": 6.75, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 39.074074074074076, "frac_reward_zero_std": 1.0, "grad_norm": 0.15772253274917603, "kl": 0.021957566030323505, "learning_rate": 4.383888888888889e-06, "loss": 0.0011, "num_tokens": 641873.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 2110 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.75, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 16.75, "completions/mean_terminated_length": 16.75, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 39.092592592592595, "frac_reward_zero_std": 0.0, "grad_norm": 4.548757076263428, "kl": 0.048912825994193554, "learning_rate": 4.383333333333334e-06, "loss": 0.148, "num_tokens": 642164.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 2111 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.5, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 12.5, "completions/mean_terminated_length": 12.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 39.111111111111114, "frac_reward_zero_std": 1.0, "grad_norm": 0.17403757572174072, "kl": 0.0955604650080204, "learning_rate": 4.382777777777778e-06, "loss": 0.0048, "num_tokens": 642430.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2112 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 39.129629629629626, "frac_reward_zero_std": 1.0, "grad_norm": 0.0037014675326645374, "kl": 0.0016928613185882568, "learning_rate": 4.382222222222223e-06, "loss": 0.0001, "num_tokens": 642690.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 2113 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.75, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 18.75, "completions/mean_terminated_length": 18.75, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 39.148148148148145, "frac_reward_zero_std": 0.0, "grad_norm": 4.348334312438965, "kl": 0.03878900036215782, "learning_rate": 4.3816666666666665e-06, "loss": 0.037, "num_tokens": 643017.0, "reward": 5.125, "reward_std": 3.4731109142303467, "rewards/reward_combined/mean": 5.125, "rewards/reward_combined/std": 3.4731109142303467, "step": 2114 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 46.5, "completions/clipped_ratio": 0.0, "completions/max_length": 62.0, "completions/max_terminated_length": 62.0, "completions/mean_length": 46.5, "completions/mean_terminated_length": 46.5, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 39.166666666666664, "frac_reward_zero_std": 1.0, "grad_norm": 0.6507596373558044, "kl": 0.11286187916994095, "learning_rate": 4.381111111111112e-06, "loss": 0.0058, "num_tokens": 643419.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 2115 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 39.18518518518518, "frac_reward_zero_std": 1.0, "grad_norm": 0.13248348236083984, "kl": 0.23639286309480667, "learning_rate": 4.380555555555556e-06, "loss": 0.0118, "num_tokens": 643723.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2116 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 39.2037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.01536649465560913, "kl": 0.03384912759065628, "learning_rate": 4.38e-06, "loss": 0.0017, "num_tokens": 643939.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 2117 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 39.22222222222222, "frac_reward_zero_std": 1.0, "grad_norm": 0.031004177406430244, "kl": 0.0062807416543364525, "learning_rate": 4.379444444444445e-06, "loss": 0.0003, "num_tokens": 644251.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 2118 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 39.24074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.0014842083910480142, "kl": 0.018363043665885925, "learning_rate": 4.378888888888889e-06, "loss": 0.0009, "num_tokens": 644511.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2119 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.5, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 18.5, "completions/mean_terminated_length": 18.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 39.25925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.06732144206762314, "kl": 0.12645094096660614, "learning_rate": 4.3783333333333335e-06, "loss": 0.0063, "num_tokens": 644813.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2120 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.5, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 10.5, "completions/mean_terminated_length": 10.5, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 39.27777777777778, "frac_reward_zero_std": 1.0, "grad_norm": 0.3445829451084137, "kl": 0.0319817503914237, "learning_rate": 4.377777777777778e-06, "loss": 0.0019, "num_tokens": 645075.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2121 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.010204081423580647, "clip_ratio/low_min": 0.010204081423580647, "clip_ratio/region_mean": 0.010204081423580647, "completion_length": 19.5, "completions/clipped_ratio": 0.0, "completions/max_length": 34.0, "completions/max_terminated_length": 34.0, "completions/mean_length": 19.5, "completions/mean_terminated_length": 19.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 39.2962962962963, "frac_reward_zero_std": 0.0, "grad_norm": 4.786332607269287, "kl": 0.05362135768518783, "learning_rate": 4.377222222222223e-06, "loss": 0.1915, "num_tokens": 645413.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 2122 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 39.31481481481482, "frac_reward_zero_std": 1.0, "grad_norm": 0.07419364899396896, "kl": 0.008118039229884744, "learning_rate": 4.3766666666666665e-06, "loss": 0.0004, "num_tokens": 645677.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2123 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 39.333333333333336, "frac_reward_zero_std": 1.0, "grad_norm": 0.014012793079018593, "kl": 0.0015453733503818512, "learning_rate": 4.376111111111112e-06, "loss": 0.0001, "num_tokens": 645921.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 2124 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.25, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 22.25, "completions/mean_terminated_length": 22.25, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 39.351851851851855, "frac_reward_zero_std": 1.0, "grad_norm": 0.33034175634384155, "kl": 0.03588105272501707, "learning_rate": 4.375555555555555e-06, "loss": 0.0018, "num_tokens": 646238.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2125 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.75, "completions/clipped_ratio": 0.0, "completions/max_length": 49.0, "completions/max_terminated_length": 49.0, "completions/mean_length": 22.75, "completions/mean_terminated_length": 22.75, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 39.370370370370374, "frac_reward_zero_std": 0.0, "grad_norm": 4.668802261352539, "kl": 0.025407645385712385, "learning_rate": 4.3750000000000005e-06, "loss": 0.2818, "num_tokens": 646553.0, "reward": 6.125, "reward_std": 3.75, "rewards/reward_combined/mean": 6.125, "rewards/reward_combined/std": 3.75, "step": 2126 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.75, "completions/clipped_ratio": 0.0, "completions/max_length": 27.0, "completions/max_terminated_length": 27.0, "completions/mean_length": 24.75, "completions/mean_terminated_length": 24.75, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 39.388888888888886, "frac_reward_zero_std": 1.0, "grad_norm": 0.15036959946155548, "kl": 0.09696263447403908, "learning_rate": 4.374444444444445e-06, "loss": 0.0048, "num_tokens": 646880.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2127 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.010416666977107525, "clip_ratio/low_min": 0.010416666977107525, "clip_ratio/region_mean": 0.010416666977107525, "completion_length": 21.25, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 21.25, "completions/mean_terminated_length": 21.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 39.407407407407405, "frac_reward_zero_std": 0.0, "grad_norm": 4.634558200836182, "kl": 0.11633483320474625, "learning_rate": 4.373888888888889e-06, "loss": 0.0076, "num_tokens": 647185.0, "reward": 2.5, "reward_std": 1.7320507764816284, "rewards/reward_combined/mean": 2.5, "rewards/reward_combined/std": 1.7320507764816284, "step": 2128 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 39.425925925925924, "frac_reward_zero_std": 1.0, "grad_norm": 0.0032722381874918938, "kl": 0.0023796764435246587, "learning_rate": 4.3733333333333335e-06, "loss": 0.0001, "num_tokens": 647469.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2129 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 39.44444444444444, "frac_reward_zero_std": 0.0, "grad_norm": 6.36565637588501, "kl": 0.04505135305225849, "learning_rate": 4.372777777777778e-06, "loss": 0.0417, "num_tokens": 647769.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 2130 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 39.46296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.12275102734565735, "kl": 0.038451073691248894, "learning_rate": 4.372222222222223e-06, "loss": 0.0019, "num_tokens": 648037.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2131 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.5, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 8.5, "completions/mean_terminated_length": 8.5, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 39.48148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.06237746775150299, "kl": 0.005723561393097043, "learning_rate": 4.371666666666667e-06, "loss": 0.0003, "num_tokens": 648291.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2132 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 39.5, "frac_reward_zero_std": 1.0, "grad_norm": 0.05870429798960686, "kl": 0.006867412477731705, "learning_rate": 4.371111111111112e-06, "loss": 0.0003, "num_tokens": 648551.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 2133 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 39.51851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.006004604510962963, "kl": 0.00017003076209221035, "learning_rate": 4.370555555555555e-06, "loss": 0.0, "num_tokens": 648831.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2134 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 39.53703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.00017616304103285074, "kl": 2.6434659957885742e-05, "learning_rate": 4.3700000000000005e-06, "loss": 0.0, "num_tokens": 649051.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 2135 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.75, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 18.75, "completions/mean_terminated_length": 18.75, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 39.55555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.053749315440654755, "kl": 0.14130394905805588, "learning_rate": 4.369444444444445e-06, "loss": 0.0069, "num_tokens": 649366.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2136 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 38.5, "completions/clipped_ratio": 0.0, "completions/max_length": 61.0, "completions/max_terminated_length": 61.0, "completions/mean_length": 38.5, "completions/mean_terminated_length": 38.5, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 39.574074074074076, "frac_reward_zero_std": 0.0, "grad_norm": 2.0407509803771973, "kl": 0.07669747248291969, "learning_rate": 4.368888888888889e-06, "loss": -0.16, "num_tokens": 649800.0, "reward": 1.125, "reward_std": 1.25, "rewards/reward_combined/mean": 1.125, "rewards/reward_combined/std": 1.25, "step": 2137 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 39.592592592592595, "frac_reward_zero_std": 1.0, "grad_norm": 0.10245373845100403, "kl": 0.011186012881807983, "learning_rate": 4.368333333333334e-06, "loss": 0.0006, "num_tokens": 650090.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2138 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.5, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 3.5, "completions/mean_terminated_length": 3.5, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 39.611111111111114, "frac_reward_zero_std": 1.0, "grad_norm": 0.02894514799118042, "kl": 0.001731373369693756, "learning_rate": 4.367777777777778e-06, "loss": 0.0001, "num_tokens": 650300.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 2139 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 39.629629629629626, "frac_reward_zero_std": 0.0, "grad_norm": 3.0889670848846436, "kl": 0.028472429141402245, "learning_rate": 4.367222222222222e-06, "loss": -0.0185, "num_tokens": 650588.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 2140 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 43.75, "completions/clipped_ratio": 0.0, "completions/max_length": 82.0, "completions/max_terminated_length": 82.0, "completions/mean_length": 43.75, "completions/mean_terminated_length": 43.75, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 39.648148148148145, "frac_reward_zero_std": 0.0, "grad_norm": 2.4243545532226562, "kl": 0.039467222057282925, "learning_rate": 4.366666666666667e-06, "loss": 0.2284, "num_tokens": 650987.0, "reward": 5.25, "reward_std": 4.5, "rewards/reward_combined/mean": 5.25, "rewards/reward_combined/std": 4.5, "step": 2141 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.75, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 20.75, "completions/mean_terminated_length": 20.75, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 39.666666666666664, "frac_reward_zero_std": 0.0, "grad_norm": 1.991183876991272, "kl": 0.04635614436119795, "learning_rate": 4.366111111111112e-06, "loss": 0.0003, "num_tokens": 651322.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 2142 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.0, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 21.0, "completions/mean_terminated_length": 21.0, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 39.68518518518518, "frac_reward_zero_std": 1.0, "grad_norm": 0.1156369149684906, "kl": 0.01456811185926199, "learning_rate": 4.365555555555555e-06, "loss": 0.0007, "num_tokens": 651650.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2143 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.75, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 12.75, "completions/mean_terminated_length": 12.75, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 39.7037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.034606728702783585, "kl": 0.001614159788005054, "learning_rate": 4.3650000000000006e-06, "loss": 0.0001, "num_tokens": 651913.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2144 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.5, "completions/clipped_ratio": 0.0, "completions/max_length": 45.0, "completions/max_terminated_length": 45.0, "completions/mean_length": 21.5, "completions/mean_terminated_length": 21.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 39.72222222222222, "frac_reward_zero_std": 0.0, "grad_norm": 5.747992038726807, "kl": 0.06559056788682938, "learning_rate": 4.364444444444445e-06, "loss": 0.2613, "num_tokens": 652219.0, "reward": 5.875, "reward_std": 3.25, "rewards/reward_combined/mean": 5.875, "rewards/reward_combined/std": 3.25, "step": 2145 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.75, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 15.75, "completions/mean_terminated_length": 15.75, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 39.74074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.01657900959253311, "kl": 0.02827127929776907, "learning_rate": 4.363888888888889e-06, "loss": 0.0014, "num_tokens": 652514.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2146 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 18.0, "completions/mean_terminated_length": 18.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 39.75925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.026955164968967438, "kl": 0.0061028830241411924, "learning_rate": 4.363333333333334e-06, "loss": 0.0003, "num_tokens": 652814.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2147 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.25, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.25, "completions/mean_terminated_length": 11.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 39.77777777777778, "frac_reward_zero_std": 0.0, "grad_norm": 9.548078536987305, "kl": 0.012555296532809734, "learning_rate": 4.362777777777778e-06, "loss": -0.0237, "num_tokens": 653087.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 2148 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.25, "completions/clipped_ratio": 0.0, "completions/max_length": 25.0, "completions/max_terminated_length": 25.0, "completions/mean_length": 24.25, "completions/mean_terminated_length": 24.25, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 39.7962962962963, "frac_reward_zero_std": 0.0, "grad_norm": 1.1979154348373413, "kl": 0.13547807931900024, "learning_rate": 4.362222222222222e-06, "loss": -0.0041, "num_tokens": 653452.0, "reward": 2.875, "reward_std": 1.25, "rewards/reward_combined/mean": 2.875, "rewards/reward_combined/std": 1.25, "step": 2149 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 27.0, "completions/clipped_ratio": 0.0, "completions/max_length": 40.0, "completions/max_terminated_length": 40.0, "completions/mean_length": 27.0, "completions/mean_terminated_length": 27.0, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 39.81481481481482, "frac_reward_zero_std": 0.0, "grad_norm": 3.0926361083984375, "kl": 0.050982870161533356, "learning_rate": 4.361666666666667e-06, "loss": 0.2056, "num_tokens": 653780.0, "reward": 4.125, "reward_std": 4.308422088623047, "rewards/reward_combined/mean": 4.125, "rewards/reward_combined/std": 4.308422088623047, "step": 2150 }, { "clip_ratio/high_max": 0.011111111380159855, "clip_ratio/high_mean": 0.011111111380159855, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.011111111380159855, "completion_length": 31.0, "completions/clipped_ratio": 0.0, "completions/max_length": 42.0, "completions/max_terminated_length": 42.0, "completions/mean_length": 31.0, "completions/mean_terminated_length": 31.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 39.833333333333336, "frac_reward_zero_std": 0.0, "grad_norm": 3.0827138423919678, "kl": 0.14794518798589706, "learning_rate": 4.361111111111112e-06, "loss": 0.0877, "num_tokens": 654140.0, "reward": 2.125, "reward_std": 1.314977765083313, "rewards/reward_combined/mean": 2.125, "rewards/reward_combined/std": 1.3149778842926025, "step": 2151 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 39.851851851851855, "frac_reward_zero_std": 1.0, "grad_norm": 0.008486122824251652, "kl": 0.0014969855546951294, "learning_rate": 4.360555555555555e-06, "loss": 0.0001, "num_tokens": 654352.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 2152 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 39.870370370370374, "frac_reward_zero_std": 1.0, "grad_norm": 0.10422572493553162, "kl": 0.00692966184578836, "learning_rate": 4.360000000000001e-06, "loss": 0.0004, "num_tokens": 654624.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2153 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.5, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 18.5, "completions/mean_terminated_length": 18.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 39.888888888888886, "frac_reward_zero_std": 0.0, "grad_norm": 4.979980945587158, "kl": 0.18174101784825325, "learning_rate": 4.359444444444445e-06, "loss": 0.0324, "num_tokens": 654922.0, "reward": 7.625, "reward_std": 0.25, "rewards/reward_combined/mean": 7.625, "rewards/reward_combined/std": 0.25, "step": 2154 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.0, "completions/clipped_ratio": 0.0, "completions/max_length": 29.0, "completions/max_terminated_length": 29.0, "completions/mean_length": 20.0, "completions/mean_terminated_length": 20.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 39.907407407407405, "frac_reward_zero_std": 0.0, "grad_norm": 3.3109936714172363, "kl": 0.045198567444458604, "learning_rate": 4.358888888888889e-06, "loss": 0.1336, "num_tokens": 655242.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 2155 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 39.925925925925924, "frac_reward_zero_std": 1.0, "grad_norm": 0.2824273109436035, "kl": 0.020612530410289764, "learning_rate": 4.358333333333334e-06, "loss": 0.001, "num_tokens": 655454.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 2156 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.5, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 17.5, "completions/mean_terminated_length": 17.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 39.94444444444444, "frac_reward_zero_std": 0.0, "grad_norm": 3.9289960861206055, "kl": 0.06757305935025215, "learning_rate": 4.357777777777778e-06, "loss": -0.1523, "num_tokens": 655772.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 2157 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.75, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 3.75, "completions/mean_terminated_length": 3.75, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 39.96296296296296, "frac_reward_zero_std": 0.0, "grad_norm": 11.984498977661133, "kl": 0.15157779306173325, "learning_rate": 4.357222222222222e-06, "loss": -0.2205, "num_tokens": 655987.0, "reward": 3.875, "reward_std": 0.25, "rewards/reward_combined/mean": 3.875, "rewards/reward_combined/std": 0.25, "step": 2158 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 39.98148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.09662385284900665, "kl": 0.00274884095415473, "learning_rate": 4.356666666666667e-06, "loss": 0.0001, "num_tokens": 656243.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 2159 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 40.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.0022539214696735144, "kl": 0.010997481644153595, "learning_rate": 4.356111111111111e-06, "loss": 0.0005, "num_tokens": 656479.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 2160 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 40.01851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.1892387717962265, "kl": 0.07984986156225204, "learning_rate": 4.3555555555555555e-06, "loss": 0.0039, "num_tokens": 656753.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2161 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.25, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 10.25, "completions/mean_terminated_length": 10.25, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 40.03703703703704, "frac_reward_zero_std": 0.0, "grad_norm": 3.0356311798095703, "kl": 0.2537289671599865, "learning_rate": 4.355000000000001e-06, "loss": 0.076, "num_tokens": 657014.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 2162 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 87.5, "completions/clipped_ratio": 0.25, "completions/max_length": 256.0, "completions/max_terminated_length": 37.0, "completions/mean_length": 87.5, "completions/mean_terminated_length": 31.33333396911621, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 40.05555555555556, "frac_reward_zero_std": 0.0, "grad_norm": 2.1448686122894287, "kl": 0.05121664237231016, "learning_rate": 4.354444444444445e-06, "loss": 0.3978, "num_tokens": 657580.0, "reward": 2.674999952316284, "reward_std": 1.649999976158142, "rewards/reward_combined/mean": 2.674999952316284, "rewards/reward_combined/std": 1.649999976158142, "step": 2163 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 28.0, "completions/clipped_ratio": 0.0, "completions/max_length": 34.0, "completions/max_terminated_length": 34.0, "completions/mean_length": 28.0, "completions/mean_terminated_length": 28.0, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 40.074074074074076, "frac_reward_zero_std": 1.0, "grad_norm": 0.08582310378551483, "kl": 0.0426955409348011, "learning_rate": 4.353888888888889e-06, "loss": 0.0021, "num_tokens": 657916.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2164 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 40.092592592592595, "frac_reward_zero_std": 1.0, "grad_norm": 0.02946593426167965, "kl": 0.008310707286000252, "learning_rate": 4.353333333333334e-06, "loss": 0.0004, "num_tokens": 658228.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 2165 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.0, "completions/clipped_ratio": 0.0, "completions/max_length": 36.0, "completions/max_terminated_length": 36.0, "completions/mean_length": 25.0, "completions/mean_terminated_length": 25.0, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 40.111111111111114, "frac_reward_zero_std": 0.0, "grad_norm": 4.1179046630859375, "kl": 0.09804044663906097, "learning_rate": 4.352777777777778e-06, "loss": 0.1573, "num_tokens": 658580.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 2166 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 40.129629629629626, "frac_reward_zero_std": 1.0, "grad_norm": 0.00016705175221432, "kl": 3.9950013160705566e-05, "learning_rate": 4.3522222222222224e-06, "loss": 0.0, "num_tokens": 658800.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 2167 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 40.148148148148145, "frac_reward_zero_std": 1.0, "grad_norm": 3.195303440093994, "kl": 0.2860847874544561, "learning_rate": 4.351666666666667e-06, "loss": 0.0143, "num_tokens": 659096.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2168 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 45.5, "completions/clipped_ratio": 0.0, "completions/max_length": 53.0, "completions/max_terminated_length": 53.0, "completions/mean_length": 45.5, "completions/mean_terminated_length": 45.5, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 40.166666666666664, "frac_reward_zero_std": 0.0, "grad_norm": 0.6203007698059082, "kl": 0.05815516412258148, "learning_rate": 4.351111111111111e-06, "loss": 0.0023, "num_tokens": 659558.0, "reward": 2.375, "reward_std": 1.25, "rewards/reward_combined/mean": 2.375, "rewards/reward_combined/std": 1.25, "step": 2169 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 27.75, "completions/clipped_ratio": 0.0, "completions/max_length": 51.0, "completions/max_terminated_length": 51.0, "completions/mean_length": 27.75, "completions/mean_terminated_length": 27.75, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 40.18518518518518, "frac_reward_zero_std": 0.0, "grad_norm": 3.921478271484375, "kl": 0.212716206908226, "learning_rate": 4.3505555555555555e-06, "loss": 0.2318, "num_tokens": 659893.0, "reward": 5.875, "reward_std": 3.25, "rewards/reward_combined/mean": 5.875, "rewards/reward_combined/std": 3.25, "step": 2170 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.0, "completions/clipped_ratio": 0.0, "completions/max_length": 32.0, "completions/max_terminated_length": 32.0, "completions/mean_length": 25.0, "completions/mean_terminated_length": 25.0, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 40.2037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.0766373947262764, "kl": 0.017702948302030563, "learning_rate": 4.350000000000001e-06, "loss": 0.0009, "num_tokens": 660237.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2171 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.5, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.5, "completions/mean_terminated_length": 6.5, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 40.22222222222222, "frac_reward_zero_std": 1.0, "grad_norm": 0.033475689589977264, "kl": 0.0010635235521476716, "learning_rate": 4.349444444444445e-06, "loss": 0.0001, "num_tokens": 660471.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 2172 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 40.24074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.0015998645685613155, "kl": 0.031797073781490326, "learning_rate": 4.348888888888889e-06, "loss": 0.0016, "num_tokens": 660739.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2173 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 40.25925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.028023023158311844, "kl": 0.004846865311264992, "learning_rate": 4.348333333333334e-06, "loss": 0.0002, "num_tokens": 660995.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2174 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.5, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 21.5, "completions/mean_terminated_length": 21.5, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 40.27777777777778, "frac_reward_zero_std": 1.0, "grad_norm": 0.06286031007766724, "kl": 0.01736563676968217, "learning_rate": 4.347777777777778e-06, "loss": 0.0008, "num_tokens": 661333.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2175 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 5.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 5.0, "completions/mean_terminated_length": 5.0, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 40.2962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.0027216756716370583, "kl": 0.00014001131057739258, "learning_rate": 4.3472222222222225e-06, "loss": 0.0, "num_tokens": 661553.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 2176 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.012500000186264515, "clip_ratio/low_min": 0.012500000186264515, "clip_ratio/region_mean": 0.012500000186264515, "completion_length": 20.0, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 20.0, "completions/mean_terminated_length": 20.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 40.31481481481482, "frac_reward_zero_std": 0.0, "grad_norm": 3.970301628112793, "kl": 0.27732591703534126, "learning_rate": 4.346666666666667e-06, "loss": -0.0332, "num_tokens": 661861.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 2177 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 40.333333333333336, "frac_reward_zero_std": 1.0, "grad_norm": 0.02869250439107418, "kl": 0.02085826825350523, "learning_rate": 4.346111111111111e-06, "loss": 0.0011, "num_tokens": 662153.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2178 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 40.351851851851855, "frac_reward_zero_std": 1.0, "grad_norm": 0.05872739478945732, "kl": 0.007796262390911579, "learning_rate": 4.3455555555555555e-06, "loss": 0.0004, "num_tokens": 662441.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2179 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.25, "completions/clipped_ratio": 0.0, "completions/max_length": 25.0, "completions/max_terminated_length": 25.0, "completions/mean_length": 24.25, "completions/mean_terminated_length": 24.25, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 40.370370370370374, "frac_reward_zero_std": 1.0, "grad_norm": 0.018753312528133392, "kl": 0.09058978781104088, "learning_rate": 4.345000000000001e-06, "loss": 0.0045, "num_tokens": 662806.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 2180 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 40.388888888888886, "frac_reward_zero_std": 1.0, "grad_norm": 0.04725540801882744, "kl": 0.22729551047086716, "learning_rate": 4.344444444444445e-06, "loss": 0.0113, "num_tokens": 663108.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2181 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.5, "completions/clipped_ratio": 0.0, "completions/max_length": 27.0, "completions/max_terminated_length": 27.0, "completions/mean_length": 23.5, "completions/mean_terminated_length": 23.5, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 40.407407407407405, "frac_reward_zero_std": 1.0, "grad_norm": 0.10226458311080933, "kl": 0.02929694950580597, "learning_rate": 4.3438888888888895e-06, "loss": 0.0015, "num_tokens": 663450.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 2182 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.0, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 21.0, "completions/mean_terminated_length": 21.0, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 40.425925925925924, "frac_reward_zero_std": 1.0, "grad_norm": 0.03481907770037651, "kl": 0.049810487776994705, "learning_rate": 4.343333333333334e-06, "loss": 0.0025, "num_tokens": 663782.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2183 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 40.44444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.009172097779810429, "kl": 0.0016911352286115289, "learning_rate": 4.342777777777778e-06, "loss": 0.0001, "num_tokens": 664054.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2184 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 40.46296296296296, "frac_reward_zero_std": 0.0, "grad_norm": 4.032604694366455, "kl": 0.012280466966331005, "learning_rate": 4.3422222222222225e-06, "loss": 0.0393, "num_tokens": 664340.0, "reward": 7.875, "reward_std": 0.25, "rewards/reward_combined/mean": 7.875, "rewards/reward_combined/std": 0.25, "step": 2185 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 40.48148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.016767017543315887, "kl": 0.03347215801477432, "learning_rate": 4.341666666666667e-06, "loss": 0.0017, "num_tokens": 664556.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 2186 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 30.5, "completions/clipped_ratio": 0.0, "completions/max_length": 35.0, "completions/max_terminated_length": 35.0, "completions/mean_length": 30.5, "completions/mean_terminated_length": 30.5, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 40.5, "frac_reward_zero_std": 0.0, "grad_norm": 1.944373607635498, "kl": 0.0701548233628273, "learning_rate": 4.341111111111111e-06, "loss": -0.0426, "num_tokens": 664906.0, "reward": 7.625, "reward_std": 0.25, "rewards/reward_combined/mean": 7.625, "rewards/reward_combined/std": 0.25, "step": 2187 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 40.51851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.002033388474956155, "kl": 0.011053182184696198, "learning_rate": 4.340555555555556e-06, "loss": 0.0006, "num_tokens": 665142.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 2188 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.25, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 19.25, "completions/mean_terminated_length": 19.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 40.53703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.042596485465765, "kl": 0.0580104012042284, "learning_rate": 4.34e-06, "loss": 0.0029, "num_tokens": 665439.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2189 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 40.55555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.0058297039940953255, "kl": 0.019237096421420574, "learning_rate": 4.339444444444445e-06, "loss": 0.001, "num_tokens": 665711.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2190 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 40.574074074074076, "frac_reward_zero_std": 1.0, "grad_norm": 0.026506569236516953, "kl": 0.0024029389023780823, "learning_rate": 4.3388888888888895e-06, "loss": 0.0001, "num_tokens": 665955.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 2191 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.75, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 21.75, "completions/mean_terminated_length": 21.75, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 40.592592592592595, "frac_reward_zero_std": 0.0, "grad_norm": 2.5230610370635986, "kl": 0.032041565515100956, "learning_rate": 4.338333333333334e-06, "loss": -0.0059, "num_tokens": 666270.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 2192 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.75, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.75, "completions/mean_terminated_length": 14.75, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 40.611111111111114, "frac_reward_zero_std": 1.0, "grad_norm": 0.06629522889852524, "kl": 0.005889627151191235, "learning_rate": 4.337777777777778e-06, "loss": 0.0003, "num_tokens": 666541.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2193 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.0, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 19.0, "completions/mean_terminated_length": 19.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 40.629629629629626, "frac_reward_zero_std": 1.0, "grad_norm": 0.10398583114147186, "kl": 0.11899657174944878, "learning_rate": 4.337222222222223e-06, "loss": 0.0058, "num_tokens": 666857.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2194 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 40.648148148148145, "frac_reward_zero_std": 1.0, "grad_norm": 0.008555039763450623, "kl": 0.0015734434127807617, "learning_rate": 4.336666666666667e-06, "loss": 0.0001, "num_tokens": 667069.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 2195 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 27.5, "completions/clipped_ratio": 0.0, "completions/max_length": 32.0, "completions/max_terminated_length": 32.0, "completions/mean_length": 27.5, "completions/mean_terminated_length": 27.5, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 40.666666666666664, "frac_reward_zero_std": 1.0, "grad_norm": 0.14111530780792236, "kl": 0.11471991240978241, "learning_rate": 4.336111111111111e-06, "loss": 0.0057, "num_tokens": 667415.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 2196 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.5, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 12.5, "completions/mean_terminated_length": 12.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 40.68518518518518, "frac_reward_zero_std": 1.0, "grad_norm": 0.04193086922168732, "kl": 0.005682365503162146, "learning_rate": 4.3355555555555565e-06, "loss": 0.0003, "num_tokens": 667689.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2197 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 40.7037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.009170031175017357, "kl": 0.008949515409767628, "learning_rate": 4.335e-06, "loss": 0.0004, "num_tokens": 667961.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2198 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 40.72222222222222, "frac_reward_zero_std": 1.0, "grad_norm": 0.009854459203779697, "kl": 0.001371145248413086, "learning_rate": 4.334444444444445e-06, "loss": 0.0001, "num_tokens": 668173.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 2199 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.75, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 9.75, "completions/mean_terminated_length": 9.75, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 40.74074074074074, "frac_reward_zero_std": 0.0, "grad_norm": 8.817619323730469, "kl": 0.042744118720293045, "learning_rate": 4.3338888888888896e-06, "loss": -0.0057, "num_tokens": 668432.0, "reward": 2.75, "reward_std": 1.5, "rewards/reward_combined/mean": 2.75, "rewards/reward_combined/std": 1.5, "step": 2200 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.5, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 2.5, "completions/mean_terminated_length": 2.5, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 40.75925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.05400708690285683, "kl": 0.002198360860347748, "learning_rate": 4.333333333333334e-06, "loss": 0.0001, "num_tokens": 668638.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 2201 }, { "clip_ratio/high_max": 0.011111111380159855, "clip_ratio/high_mean": 0.011111111380159855, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.011111111380159855, "completion_length": 24.0, "completions/clipped_ratio": 0.0, "completions/max_length": 31.0, "completions/max_terminated_length": 31.0, "completions/mean_length": 24.0, "completions/mean_terminated_length": 24.0, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 40.77777777777778, "frac_reward_zero_std": 0.0, "grad_norm": 5.98787784576416, "kl": 0.40099385380744934, "learning_rate": 4.332777777777778e-06, "loss": 0.1264, "num_tokens": 668954.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 2202 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.25, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 11.25, "completions/mean_terminated_length": 11.25, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 40.7962962962963, "frac_reward_zero_std": 0.0, "grad_norm": 3.29809832572937, "kl": 0.010298696346580982, "learning_rate": 4.332222222222223e-06, "loss": 0.1347, "num_tokens": 669223.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 2203 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 40.81481481481482, "frac_reward_zero_std": 1.0, "grad_norm": 0.01407769974321127, "kl": 0.001388232340104878, "learning_rate": 4.331666666666667e-06, "loss": 0.0001, "num_tokens": 669541.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2204 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.25, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 19.25, "completions/mean_terminated_length": 19.25, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 40.833333333333336, "frac_reward_zero_std": 0.0, "grad_norm": 2.206273317337036, "kl": 0.12945744395256042, "learning_rate": 4.331111111111111e-06, "loss": 0.0169, "num_tokens": 669838.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 2205 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.75, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 9.75, "completions/mean_terminated_length": 9.75, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 40.851851851851855, "frac_reward_zero_std": 1.0, "grad_norm": 0.06800013780593872, "kl": 0.021019854582846165, "learning_rate": 4.3305555555555565e-06, "loss": 0.0011, "num_tokens": 670101.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2206 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.5, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 12.5, "completions/mean_terminated_length": 12.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 40.870370370370374, "frac_reward_zero_std": 1.0, "grad_norm": 0.11502141505479813, "kl": 0.00802311979350634, "learning_rate": 4.33e-06, "loss": 0.0004, "num_tokens": 670375.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2207 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 40.888888888888886, "frac_reward_zero_std": 1.0, "grad_norm": 0.024685969576239586, "kl": 0.0007162988185882568, "learning_rate": 4.329444444444445e-06, "loss": 0.0, "num_tokens": 670631.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 2208 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.75, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 10.75, "completions/mean_terminated_length": 10.75, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 40.907407407407405, "frac_reward_zero_std": 1.0, "grad_norm": 0.0719156414270401, "kl": 0.006228957790881395, "learning_rate": 4.328888888888889e-06, "loss": 0.0003, "num_tokens": 670902.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 2209 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.5, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 18.5, "completions/mean_terminated_length": 18.5, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 40.925925925925924, "frac_reward_zero_std": 1.0, "grad_norm": 0.08090145140886307, "kl": 0.00920449011027813, "learning_rate": 4.328333333333334e-06, "loss": 0.0005, "num_tokens": 671216.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2210 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 40.94444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.026388827711343765, "kl": 0.00467345118522644, "learning_rate": 4.327777777777778e-06, "loss": 0.0002, "num_tokens": 671500.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2211 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 40.96296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.15904709696769714, "kl": 0.020078353118151426, "learning_rate": 4.327222222222223e-06, "loss": 0.0011, "num_tokens": 671800.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2212 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 40.98148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.13294589519500732, "kl": 0.0331199960783124, "learning_rate": 4.326666666666667e-06, "loss": 0.0017, "num_tokens": 672090.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2213 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.5, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 17.5, "completions/mean_terminated_length": 17.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 41.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.036805883049964905, "kl": 0.009425764437764883, "learning_rate": 4.326111111111111e-06, "loss": 0.0005, "num_tokens": 672388.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2214 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 41.01851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.001448125927709043, "kl": 0.011150792241096497, "learning_rate": 4.325555555555557e-06, "loss": 0.0006, "num_tokens": 672624.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 2215 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 41.03703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.008163271471858025, "kl": 0.0019264817237854004, "learning_rate": 4.325e-06, "loss": 0.0001, "num_tokens": 672836.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 2216 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 41.05555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.010309258475899696, "kl": 0.001953825354576111, "learning_rate": 4.324444444444445e-06, "loss": 0.0001, "num_tokens": 673108.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2217 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.0, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 23.0, "completions/mean_terminated_length": 23.0, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 41.074074074074076, "frac_reward_zero_std": 1.0, "grad_norm": 0.10612411051988602, "kl": 0.023919325321912766, "learning_rate": 4.323888888888889e-06, "loss": 0.0012, "num_tokens": 673428.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2218 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.5, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 19.5, "completions/mean_terminated_length": 19.5, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 41.092592592592595, "frac_reward_zero_std": 1.0, "grad_norm": 0.06166279315948486, "kl": 0.03142641205340624, "learning_rate": 4.323333333333334e-06, "loss": 0.0014, "num_tokens": 673758.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2219 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 41.111111111111114, "frac_reward_zero_std": 1.0, "grad_norm": 0.00485161691904068, "kl": 0.0012698322534561157, "learning_rate": 4.322777777777778e-06, "loss": 0.0001, "num_tokens": 674018.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 2220 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.75, "completions/clipped_ratio": 0.0, "completions/max_length": 33.0, "completions/max_terminated_length": 33.0, "completions/mean_length": 23.75, "completions/mean_terminated_length": 23.75, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 41.129629629629626, "frac_reward_zero_std": 0.0, "grad_norm": 3.1145472526550293, "kl": 0.21777945011854172, "learning_rate": 4.322222222222223e-06, "loss": -0.1078, "num_tokens": 674337.0, "reward": 7.625, "reward_std": 0.25, "rewards/reward_combined/mean": 7.625, "rewards/reward_combined/std": 0.25, "step": 2221 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.0, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 24.0, "completions/mean_terminated_length": 24.0, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 41.148148148148145, "frac_reward_zero_std": 0.0, "grad_norm": 21.38892936706543, "kl": 4.032417647540569, "learning_rate": 4.321666666666667e-06, "loss": 0.2015, "num_tokens": 674701.0, "reward": 2.875, "reward_std": 1.25, "rewards/reward_combined/mean": 2.875, "rewards/reward_combined/std": 1.25, "step": 2222 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.5, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 9.5, "completions/mean_terminated_length": 9.5, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 41.166666666666664, "frac_reward_zero_std": 1.0, "grad_norm": 0.10166177153587341, "kl": 0.015566065907478333, "learning_rate": 4.3211111111111114e-06, "loss": 0.0008, "num_tokens": 674963.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2223 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 41.18518518518518, "frac_reward_zero_std": 0.0, "grad_norm": 6.035851001739502, "kl": 0.10547785274684429, "learning_rate": 4.320555555555556e-06, "loss": -0.0406, "num_tokens": 675225.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 2224 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.0, "completions/clipped_ratio": 0.0, "completions/max_length": 27.0, "completions/max_terminated_length": 27.0, "completions/mean_length": 21.0, "completions/mean_terminated_length": 21.0, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 41.2037037037037, "frac_reward_zero_std": 0.0, "grad_norm": 4.599395751953125, "kl": 0.05397570040076971, "learning_rate": 4.32e-06, "loss": 0.0882, "num_tokens": 675529.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 2225 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 41.22222222222222, "frac_reward_zero_std": 1.0, "grad_norm": 0.3381249010562897, "kl": 0.03351924940943718, "learning_rate": 4.319444444444445e-06, "loss": 0.0017, "num_tokens": 675801.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2226 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 41.24074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.205280601978302, "kl": 0.012311721220612526, "learning_rate": 4.318888888888889e-06, "loss": 0.0006, "num_tokens": 676105.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2227 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.0, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 21.0, "completions/mean_terminated_length": 21.0, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 41.25925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.28568312525749207, "kl": 0.07458744756877422, "learning_rate": 4.318333333333334e-06, "loss": 0.0037, "num_tokens": 676437.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2228 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.5, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 9.5, "completions/mean_terminated_length": 9.5, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 41.27777777777778, "frac_reward_zero_std": 1.0, "grad_norm": 0.09859422594308853, "kl": 0.010110905859619379, "learning_rate": 4.3177777777777776e-06, "loss": 0.0005, "num_tokens": 676695.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2229 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 41.2962962962963, "frac_reward_zero_std": 0.0, "grad_norm": 4.929573059082031, "kl": 0.04895323887467384, "learning_rate": 4.317222222222223e-06, "loss": 0.0784, "num_tokens": 676995.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 2230 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.5, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.5, "completions/mean_terminated_length": 6.5, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 41.31481481481482, "frac_reward_zero_std": 1.0, "grad_norm": 0.12397951632738113, "kl": 0.012396838632412255, "learning_rate": 4.316666666666667e-06, "loss": 0.0006, "num_tokens": 677229.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 2231 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 41.333333333333336, "frac_reward_zero_std": 1.0, "grad_norm": 0.026362193748354912, "kl": 0.0005384623946156353, "learning_rate": 4.3161111111111115e-06, "loss": 0.0, "num_tokens": 677485.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 2232 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 18.0, "completions/mean_terminated_length": 18.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 41.351851851851855, "frac_reward_zero_std": 1.0, "grad_norm": 0.7206133604049683, "kl": 0.18455208837985992, "learning_rate": 4.315555555555556e-06, "loss": 0.0073, "num_tokens": 677777.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 2233 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.5, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 12.5, "completions/mean_terminated_length": 12.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 41.370370370370374, "frac_reward_zero_std": 1.0, "grad_norm": 0.04395926743745804, "kl": 0.0028948935796506703, "learning_rate": 4.315e-06, "loss": 0.0001, "num_tokens": 678051.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2234 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.25, "completions/clipped_ratio": 0.0, "completions/max_length": 39.0, "completions/max_terminated_length": 39.0, "completions/mean_length": 32.25, "completions/mean_terminated_length": 32.25, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 41.388888888888886, "frac_reward_zero_std": 0.0, "grad_norm": 2.917785406112671, "kl": 0.06690246611833572, "learning_rate": 4.314444444444445e-06, "loss": -0.0566, "num_tokens": 678408.0, "reward": 7.625, "reward_std": 0.25, "rewards/reward_combined/mean": 7.625, "rewards/reward_combined/std": 0.25, "step": 2235 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 51.0, "completions/clipped_ratio": 0.0, "completions/max_length": 53.0, "completions/max_terminated_length": 53.0, "completions/mean_length": 51.0, "completions/mean_terminated_length": 51.0, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "epoch": 41.407407407407405, "frac_reward_zero_std": 1.0, "grad_norm": 0.09463395923376083, "kl": 0.10251831263303757, "learning_rate": 4.313888888888889e-06, "loss": 0.0051, "num_tokens": 678892.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 2236 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 54.75, "completions/clipped_ratio": 0.0, "completions/max_length": 140.0, "completions/max_terminated_length": 140.0, "completions/mean_length": 54.75, "completions/mean_terminated_length": 54.75, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 41.425925925925924, "frac_reward_zero_std": 0.0, "grad_norm": 2.00931715965271, "kl": 0.10591451823711395, "learning_rate": 4.313333333333334e-06, "loss": 0.3635, "num_tokens": 679347.0, "reward": 3.125, "reward_std": 0.75, "rewards/reward_combined/mean": 3.125, "rewards/reward_combined/std": 0.75, "step": 2237 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 41.44444444444444, "frac_reward_zero_std": 0.0, "grad_norm": 3.6512320041656494, "kl": 0.002232499187812209, "learning_rate": 4.312777777777778e-06, "loss": 0.0141, "num_tokens": 679669.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 2238 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 41.46296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.1097194105386734, "kl": 0.02164249401539564, "learning_rate": 4.312222222222223e-06, "loss": 0.0015, "num_tokens": 679925.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 2239 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.25, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 15.25, "completions/mean_terminated_length": 15.25, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 41.48148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.01902916096150875, "kl": 0.0426630936563015, "learning_rate": 4.311666666666667e-06, "loss": 0.0021, "num_tokens": 680218.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2240 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.25, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 17.25, "completions/mean_terminated_length": 17.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 41.5, "frac_reward_zero_std": 1.0, "grad_norm": 0.04052285477519035, "kl": 0.10147438570857048, "learning_rate": 4.3111111111111115e-06, "loss": 0.005, "num_tokens": 680527.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2241 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.0, "completions/clipped_ratio": 0.0, "completions/max_length": 30.0, "completions/max_terminated_length": 30.0, "completions/mean_length": 25.0, "completions/mean_terminated_length": 25.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 41.51851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.10780075937509537, "kl": 0.030222715809941292, "learning_rate": 4.310555555555556e-06, "loss": 0.0016, "num_tokens": 680875.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 2242 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 41.53703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.08356369286775589, "kl": 0.03455536812543869, "learning_rate": 4.31e-06, "loss": 0.0017, "num_tokens": 681143.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2243 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.5, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 18.5, "completions/mean_terminated_length": 18.5, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 41.55555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.062430694699287415, "kl": 0.005287828855216503, "learning_rate": 4.309444444444445e-06, "loss": 0.0003, "num_tokens": 681457.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2244 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.75, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 18.75, "completions/mean_terminated_length": 18.75, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 41.574074074074076, "frac_reward_zero_std": 1.0, "grad_norm": 0.09675279259681702, "kl": 0.0238949004560709, "learning_rate": 4.308888888888889e-06, "loss": 0.0012, "num_tokens": 681752.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2245 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 41.592592592592595, "frac_reward_zero_std": 1.0, "grad_norm": 0.13046808540821075, "kl": 0.016701119020581245, "learning_rate": 4.308333333333334e-06, "loss": 0.0008, "num_tokens": 682018.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2246 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 30.0, "completions/clipped_ratio": 0.0, "completions/max_length": 45.0, "completions/max_terminated_length": 45.0, "completions/mean_length": 30.0, "completions/mean_terminated_length": 30.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 41.611111111111114, "frac_reward_zero_std": 0.0, "grad_norm": 3.5313966274261475, "kl": 0.05627114325761795, "learning_rate": 4.307777777777778e-06, "loss": -0.2058, "num_tokens": 682390.0, "reward": 6.0, "reward_std": 3.674234628677368, "rewards/reward_combined/mean": 6.0, "rewards/reward_combined/std": 3.674234628677368, "step": 2247 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 41.629629629629626, "frac_reward_zero_std": 1.0, "grad_norm": 0.09034192562103271, "kl": 0.011750052217394114, "learning_rate": 4.307222222222223e-06, "loss": 0.0006, "num_tokens": 682680.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2248 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 41.648148148148145, "frac_reward_zero_std": 1.0, "grad_norm": 0.001917217974551022, "kl": 0.018259422853589058, "learning_rate": 4.306666666666666e-06, "loss": 0.0009, "num_tokens": 682940.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2249 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 41.666666666666664, "frac_reward_zero_std": 1.0, "grad_norm": 0.002359213773161173, "kl": 0.0018438739934936166, "learning_rate": 4.3061111111111116e-06, "loss": 0.0001, "num_tokens": 683220.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2250 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 41.68518518518518, "frac_reward_zero_std": 1.0, "grad_norm": 0.026515165343880653, "kl": 0.0017893016338348389, "learning_rate": 4.305555555555556e-06, "loss": 0.0001, "num_tokens": 683432.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 2251 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 41.7037037037037, "frac_reward_zero_std": 0.0, "grad_norm": 4.029013633728027, "kl": 0.011849640402942896, "learning_rate": 4.305e-06, "loss": 0.0004, "num_tokens": 683716.0, "reward": 7.875, "reward_std": 0.25, "rewards/reward_combined/mean": 7.875, "rewards/reward_combined/std": 0.25, "step": 2252 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.75, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 16.75, "completions/mean_terminated_length": 16.75, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 41.72222222222222, "frac_reward_zero_std": 0.0, "grad_norm": 2.738858222961426, "kl": 0.006593338213860989, "learning_rate": 4.304444444444445e-06, "loss": 0.0431, "num_tokens": 684031.0, "reward": 2.75, "reward_std": 1.5, "rewards/reward_combined/mean": 2.75, "rewards/reward_combined/std": 1.5, "step": 2253 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 41.74074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.04493620991706848, "kl": 0.0007495135068893433, "learning_rate": 4.303888888888889e-06, "loss": 0.0, "num_tokens": 684235.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 2254 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 41.75925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.012311289086937904, "kl": 0.002764997538179159, "learning_rate": 4.303333333333334e-06, "loss": 0.0001, "num_tokens": 684519.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2255 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.0, "completions/clipped_ratio": 0.0, "completions/max_length": 48.0, "completions/max_terminated_length": 48.0, "completions/mean_length": 24.0, "completions/mean_terminated_length": 24.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 41.77777777777778, "frac_reward_zero_std": 0.0, "grad_norm": 3.0842013359069824, "kl": 0.21524912118911743, "learning_rate": 4.302777777777778e-06, "loss": 0.2628, "num_tokens": 684855.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 2256 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 45.0, "completions/clipped_ratio": 0.0, "completions/max_length": 69.0, "completions/max_terminated_length": 69.0, "completions/mean_length": 45.0, "completions/mean_terminated_length": 45.0, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 41.7962962962963, "frac_reward_zero_std": 0.0, "grad_norm": 1.9574642181396484, "kl": 0.28790270909667015, "learning_rate": 4.302222222222223e-06, "loss": -0.2341, "num_tokens": 685259.0, "reward": -0.25, "reward_std": 1.5, "rewards/reward_combined/mean": -0.25, "rewards/reward_combined/std": 1.5, "step": 2257 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 41.81481481481482, "frac_reward_zero_std": 0.0, "grad_norm": 7.623130798339844, "kl": 0.23879460990428925, "learning_rate": 4.301666666666666e-06, "loss": -0.0483, "num_tokens": 685549.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 2258 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 41.833333333333336, "frac_reward_zero_std": 1.0, "grad_norm": 0.01147827785462141, "kl": 0.01732703484594822, "learning_rate": 4.301111111111112e-06, "loss": 0.0009, "num_tokens": 685821.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2259 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 30.25, "completions/clipped_ratio": 0.0, "completions/max_length": 39.0, "completions/max_terminated_length": 39.0, "completions/mean_length": 30.25, "completions/mean_terminated_length": 30.25, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 41.851851851851855, "frac_reward_zero_std": 0.0, "grad_norm": 3.7173686027526855, "kl": 0.12436195090413094, "learning_rate": 4.300555555555556e-06, "loss": -0.0331, "num_tokens": 686158.0, "reward": 2.875, "reward_std": 1.25, "rewards/reward_combined/mean": 2.875, "rewards/reward_combined/std": 1.25, "step": 2260 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 41.870370370370374, "frac_reward_zero_std": 1.0, "grad_norm": 0.06795478612184525, "kl": 0.011245336849242449, "learning_rate": 4.3e-06, "loss": 0.0006, "num_tokens": 686450.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2261 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 41.888888888888886, "frac_reward_zero_std": 1.0, "grad_norm": 0.012646314688026905, "kl": 0.034624986350536346, "learning_rate": 4.299444444444445e-06, "loss": 0.0017, "num_tokens": 686666.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 2262 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.75, "completions/clipped_ratio": 0.0, "completions/max_length": 30.0, "completions/max_terminated_length": 30.0, "completions/mean_length": 22.75, "completions/mean_terminated_length": 22.75, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 41.907407407407405, "frac_reward_zero_std": 0.0, "grad_norm": 2.4711360931396484, "kl": 0.052400749176740646, "learning_rate": 4.298888888888889e-06, "loss": -0.12, "num_tokens": 686985.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 2263 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.5, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 21.5, "completions/mean_terminated_length": 21.5, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 41.925925925925924, "frac_reward_zero_std": 0.0, "grad_norm": 1.587382197380066, "kl": 0.006777544040232897, "learning_rate": 4.298333333333333e-06, "loss": -0.0198, "num_tokens": 687315.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 2264 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.25, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 11.25, "completions/mean_terminated_length": 11.25, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 41.94444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.09722775220870972, "kl": 0.018815718591213226, "learning_rate": 4.297777777777778e-06, "loss": 0.001, "num_tokens": 687580.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2265 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 41.96296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.00015762732073199004, "kl": 4.5903027057647705e-05, "learning_rate": 4.297222222222223e-06, "loss": 0.0, "num_tokens": 687800.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 2266 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.75, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 4.75, "completions/mean_terminated_length": 4.75, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 41.98148148148148, "frac_reward_zero_std": 0.0, "grad_norm": 5.358100414276123, "kl": 0.14382750843651593, "learning_rate": 4.2966666666666665e-06, "loss": 0.0025, "num_tokens": 688019.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 2267 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 42.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.04936422407627106, "kl": 0.005101040005683899, "learning_rate": 4.296111111111112e-06, "loss": 0.0003, "num_tokens": 688279.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 2268 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.5, "completions/clipped_ratio": 0.0, "completions/max_length": 27.0, "completions/max_terminated_length": 27.0, "completions/mean_length": 21.5, "completions/mean_terminated_length": 21.5, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 42.01851851851852, "frac_reward_zero_std": 0.0, "grad_norm": 3.5078396797180176, "kl": 0.038324784487485886, "learning_rate": 4.295555555555556e-06, "loss": 0.0903, "num_tokens": 688593.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 2269 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.25, "completions/clipped_ratio": 0.0, "completions/max_length": 38.0, "completions/max_terminated_length": 38.0, "completions/mean_length": 22.25, "completions/mean_terminated_length": 22.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 42.03703703703704, "frac_reward_zero_std": 0.0, "grad_norm": 3.8905832767486572, "kl": 0.1454956978559494, "learning_rate": 4.295e-06, "loss": 0.1951, "num_tokens": 688922.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 2270 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 42.05555555555556, "frac_reward_zero_std": 0.0, "grad_norm": 10.163788795471191, "kl": 0.051508355885744095, "learning_rate": 4.294444444444445e-06, "loss": 0.1746, "num_tokens": 689134.0, "reward": 2.75, "reward_std": 1.5, "rewards/reward_combined/mean": 2.75, "rewards/reward_combined/std": 1.5, "step": 2271 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 42.074074074074076, "frac_reward_zero_std": 1.0, "grad_norm": 0.05531148239970207, "kl": 0.004705801606178284, "learning_rate": 4.293888888888889e-06, "loss": 0.0002, "num_tokens": 689346.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 2272 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 42.092592592592595, "frac_reward_zero_std": 1.0, "grad_norm": 0.01308757346123457, "kl": 0.011493874713778496, "learning_rate": 4.2933333333333334e-06, "loss": 0.0006, "num_tokens": 689606.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2273 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.0, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 20.0, "completions/mean_terminated_length": 20.0, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 42.111111111111114, "frac_reward_zero_std": 1.0, "grad_norm": 0.08436547964811325, "kl": 0.13360636308789253, "learning_rate": 4.292777777777778e-06, "loss": 0.0064, "num_tokens": 689906.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2274 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.25, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 22.25, "completions/mean_terminated_length": 22.25, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 42.129629629629626, "frac_reward_zero_std": 1.0, "grad_norm": 0.07849401980638504, "kl": 0.0325674107298255, "learning_rate": 4.292222222222223e-06, "loss": 0.0016, "num_tokens": 690223.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2275 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.0, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 19.0, "completions/mean_terminated_length": 19.0, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 42.148148148148145, "frac_reward_zero_std": 1.0, "grad_norm": 0.052186284214258194, "kl": 0.02595239970833063, "learning_rate": 4.2916666666666665e-06, "loss": 0.0013, "num_tokens": 690551.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2276 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 42.166666666666664, "frac_reward_zero_std": 1.0, "grad_norm": 0.03733420744538307, "kl": 0.0036786795826628804, "learning_rate": 4.291111111111112e-06, "loss": 0.0002, "num_tokens": 690821.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2277 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 48.5, "completions/clipped_ratio": 0.0, "completions/max_length": 53.0, "completions/max_terminated_length": 53.0, "completions/mean_length": 48.5, "completions/mean_terminated_length": 48.5, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 42.18518518518518, "frac_reward_zero_std": 0.0, "grad_norm": 2.417738914489746, "kl": 0.13634514808654785, "learning_rate": 4.290555555555556e-06, "loss": -0.0309, "num_tokens": 691295.0, "reward": 2.799999952316284, "reward_std": 0.4000000059604645, "rewards/reward_combined/mean": 2.799999952316284, "rewards/reward_combined/std": 0.4000000059604645, "step": 2278 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 78.75, "completions/clipped_ratio": 0.25, "completions/max_length": 256.0, "completions/max_terminated_length": 36.0, "completions/mean_length": 78.75, "completions/mean_terminated_length": 19.666667938232422, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 42.2037037037037, "frac_reward_zero_std": 0.0, "grad_norm": 5.12581205368042, "kl": 0.05213590478524566, "learning_rate": 4.2900000000000004e-06, "loss": 0.5296, "num_tokens": 691830.0, "reward": 5.675000190734863, "reward_std": 2.2186708450317383, "rewards/reward_combined/mean": 5.675000190734863, "rewards/reward_combined/std": 2.2186708450317383, "step": 2279 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 42.22222222222222, "frac_reward_zero_std": 1.0, "grad_norm": 0.082194022834301, "kl": 0.004601437598466873, "learning_rate": 4.289444444444445e-06, "loss": 0.0002, "num_tokens": 692090.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 2280 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 42.24074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.037094298750162125, "kl": 0.007843740284442902, "learning_rate": 4.288888888888889e-06, "loss": 0.0004, "num_tokens": 692342.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2281 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.0, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 19.0, "completions/mean_terminated_length": 19.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 42.25925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.5503538250923157, "kl": 0.09805011190474033, "learning_rate": 4.2883333333333335e-06, "loss": 0.0052, "num_tokens": 692638.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 2282 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 18.0, "completions/mean_terminated_length": 18.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 42.27777777777778, "frac_reward_zero_std": 1.0, "grad_norm": 0.006583348382264376, "kl": 0.0031940987100824714, "learning_rate": 4.287777777777778e-06, "loss": 0.0002, "num_tokens": 692950.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2283 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.25, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 20.25, "completions/mean_terminated_length": 20.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 42.2962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.194696843624115, "kl": 0.028873255476355553, "learning_rate": 4.287222222222222e-06, "loss": 0.0016, "num_tokens": 693275.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2284 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.75, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 19.75, "completions/mean_terminated_length": 19.75, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 42.31481481481482, "frac_reward_zero_std": 1.0, "grad_norm": 0.048441726714372635, "kl": 0.04260704480111599, "learning_rate": 4.2866666666666666e-06, "loss": 0.0022, "num_tokens": 693574.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2285 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 42.333333333333336, "frac_reward_zero_std": 0.0, "grad_norm": 1.2182410955429077, "kl": 0.3153347671031952, "learning_rate": 4.286111111111112e-06, "loss": 0.015, "num_tokens": 693864.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 2286 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 42.351851851851855, "frac_reward_zero_std": 1.0, "grad_norm": 0.001548115978948772, "kl": 0.018371972255408764, "learning_rate": 4.285555555555556e-06, "loss": 0.0009, "num_tokens": 694124.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2287 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 42.370370370370374, "frac_reward_zero_std": 1.0, "grad_norm": 0.6095479726791382, "kl": 0.13672462105751038, "learning_rate": 4.2850000000000005e-06, "loss": 0.0068, "num_tokens": 694420.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2288 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 42.388888888888886, "frac_reward_zero_std": 1.0, "grad_norm": 0.00016045225493144244, "kl": 4.6022236347198486e-05, "learning_rate": 4.284444444444445e-06, "loss": 0.0, "num_tokens": 694640.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 2289 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 12.5, "completions/mean_terminated_length": 12.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 42.407407407407405, "frac_reward_zero_std": 1.0, "grad_norm": 0.4140505790710449, "kl": 0.07915064133703709, "learning_rate": 4.283888888888889e-06, "loss": 0.0036, "num_tokens": 694922.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2290 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 17.0, "completions/mean_terminated_length": 17.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 42.425925925925924, "frac_reward_zero_std": 1.0, "grad_norm": 0.026697855442762375, "kl": 0.009141004644334316, "learning_rate": 4.2833333333333335e-06, "loss": 0.0005, "num_tokens": 695230.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2291 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.5, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 19.5, "completions/mean_terminated_length": 19.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 42.44444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.1393895000219345, "kl": 0.05198722705245018, "learning_rate": 4.282777777777778e-06, "loss": 0.0026, "num_tokens": 695556.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2292 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.75, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 12.75, "completions/mean_terminated_length": 12.75, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 42.46296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.06700775772333145, "kl": 0.11120892688632011, "learning_rate": 4.282222222222222e-06, "loss": 0.0054, "num_tokens": 695823.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2293 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.25, "completions/clipped_ratio": 0.0, "completions/max_length": 28.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 17.25, "completions/mean_terminated_length": 17.25, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 42.48148148148148, "frac_reward_zero_std": 0.0, "grad_norm": 5.187509059906006, "kl": 0.0841010557487607, "learning_rate": 4.281666666666667e-06, "loss": -0.1768, "num_tokens": 696112.0, "reward": 7.625, "reward_std": 0.25, "rewards/reward_combined/mean": 7.625, "rewards/reward_combined/std": 0.25, "step": 2294 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.5, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 12.5, "completions/mean_terminated_length": 12.5, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 42.5, "frac_reward_zero_std": 1.0, "grad_norm": 0.1796962171792984, "kl": 0.015355446841567755, "learning_rate": 4.281111111111112e-06, "loss": 0.0008, "num_tokens": 696374.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2295 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 7.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 7.0, "completions/mean_terminated_length": 7.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 42.51851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.15174180269241333, "kl": 0.007726295385509729, "learning_rate": 4.280555555555556e-06, "loss": 0.0004, "num_tokens": 696622.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 2296 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 42.53703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.01149027980864048, "kl": 0.03494753688573837, "learning_rate": 4.2800000000000005e-06, "loss": 0.0017, "num_tokens": 696838.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 2297 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 42.55555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.055817507207393646, "kl": 0.022012368543073535, "learning_rate": 4.279444444444445e-06, "loss": 0.0011, "num_tokens": 697128.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2298 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 42.574074074074076, "frac_reward_zero_std": 1.0, "grad_norm": 0.0639386773109436, "kl": 0.008902129251509905, "learning_rate": 4.278888888888889e-06, "loss": 0.0003, "num_tokens": 697398.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2299 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.5, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 4.5, "completions/mean_terminated_length": 4.5, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 42.592592592592595, "frac_reward_zero_std": 1.0, "grad_norm": 0.14092697203159332, "kl": 0.019440804433543235, "learning_rate": 4.278333333333334e-06, "loss": 0.0009, "num_tokens": 697616.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 2300 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 42.611111111111114, "frac_reward_zero_std": 1.0, "grad_norm": 0.07562453299760818, "kl": 0.010882650967687368, "learning_rate": 4.277777777777778e-06, "loss": 0.0006, "num_tokens": 697908.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2301 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.25, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.25, "completions/mean_terminated_length": 11.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 42.629629629629626, "frac_reward_zero_std": 1.0, "grad_norm": 0.046078506857156754, "kl": 0.007296965457499027, "learning_rate": 4.277222222222222e-06, "loss": 0.0004, "num_tokens": 698181.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2302 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 27.25, "completions/clipped_ratio": 0.0, "completions/max_length": 49.0, "completions/max_terminated_length": 49.0, "completions/mean_length": 27.25, "completions/mean_terminated_length": 27.25, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 42.648148148148145, "frac_reward_zero_std": 0.0, "grad_norm": 3.8124096393585205, "kl": 0.055794334039092064, "learning_rate": 4.276666666666667e-06, "loss": 0.2187, "num_tokens": 698542.0, "reward": 6.375, "reward_std": 3.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 3.25, "step": 2303 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 30.25, "completions/clipped_ratio": 0.0, "completions/max_length": 37.0, "completions/max_terminated_length": 37.0, "completions/mean_length": 30.25, "completions/mean_terminated_length": 30.25, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 42.666666666666664, "frac_reward_zero_std": 0.0, "grad_norm": 3.8017418384552, "kl": 0.05426994897425175, "learning_rate": 4.276111111111111e-06, "loss": -0.1253, "num_tokens": 698887.0, "reward": 5.875, "reward_std": 3.25, "rewards/reward_combined/mean": 5.875, "rewards/reward_combined/std": 3.25, "step": 2304 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.0, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 24.0, "completions/mean_terminated_length": 24.0, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 42.68518518518518, "frac_reward_zero_std": 0.0, "grad_norm": 0.9336676001548767, "kl": 0.12716148793697357, "learning_rate": 4.275555555555556e-06, "loss": 0.0063, "num_tokens": 699251.0, "reward": 2.875, "reward_std": 1.25, "rewards/reward_combined/mean": 2.875, "rewards/reward_combined/std": 1.25, "step": 2305 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 42.7037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.007152729667723179, "kl": 0.002407357096672058, "learning_rate": 4.2750000000000006e-06, "loss": 0.0001, "num_tokens": 699463.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 2306 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 42.72222222222222, "frac_reward_zero_std": 1.0, "grad_norm": 0.0885990560054779, "kl": 0.0066000878578051925, "learning_rate": 4.274444444444445e-06, "loss": 0.0003, "num_tokens": 699719.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 2307 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.25, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.25, "completions/mean_terminated_length": 14.25, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 42.74074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.012220224365592003, "kl": 0.001300348900258541, "learning_rate": 4.273888888888889e-06, "loss": 0.0001, "num_tokens": 700036.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2308 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 42.75925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.03301703557372093, "kl": 0.00979207269847393, "learning_rate": 4.273333333333334e-06, "loss": 0.0005, "num_tokens": 700348.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 2309 }, { "clip_ratio/high_max": 0.008771929889917374, "clip_ratio/high_mean": 0.008771929889917374, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.008771929889917374, "completion_length": 28.75, "completions/clipped_ratio": 0.0, "completions/max_length": 37.0, "completions/max_terminated_length": 37.0, "completions/mean_length": 28.75, "completions/mean_terminated_length": 28.75, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 42.77777777777778, "frac_reward_zero_std": 0.0, "grad_norm": 2.033036470413208, "kl": 0.2076609879732132, "learning_rate": 4.272777777777778e-06, "loss": -0.1459, "num_tokens": 700691.0, "reward": 7.625, "reward_std": 0.25, "rewards/reward_combined/mean": 7.625, "rewards/reward_combined/std": 0.25, "step": 2310 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 42.7962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.025435328483581543, "kl": 0.02985827438533306, "learning_rate": 4.272222222222222e-06, "loss": 0.0015, "num_tokens": 700959.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2311 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.0, "completions/clipped_ratio": 0.0, "completions/max_length": 46.0, "completions/max_terminated_length": 46.0, "completions/mean_length": 25.0, "completions/mean_terminated_length": 25.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 42.81481481481482, "frac_reward_zero_std": 0.0, "grad_norm": 4.190512657165527, "kl": 0.22276408970355988, "learning_rate": 4.271666666666667e-06, "loss": 0.2266, "num_tokens": 701283.0, "reward": 5.875, "reward_std": 3.25, "rewards/reward_combined/mean": 5.875, "rewards/reward_combined/std": 3.25, "step": 2312 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 42.833333333333336, "frac_reward_zero_std": 1.0, "grad_norm": 0.009196159429848194, "kl": 0.0012736357748508453, "learning_rate": 4.271111111111111e-06, "loss": 0.0001, "num_tokens": 701527.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 2313 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.5, "completions/clipped_ratio": 0.0, "completions/max_length": 28.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 22.5, "completions/mean_terminated_length": 22.5, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 42.851851851851855, "frac_reward_zero_std": 1.0, "grad_norm": 0.06626977771520615, "kl": 0.01958259893581271, "learning_rate": 4.270555555555556e-06, "loss": 0.0011, "num_tokens": 701865.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 2314 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 42.870370370370374, "frac_reward_zero_std": 1.0, "grad_norm": 0.0011654793052002788, "kl": 0.011221811175346375, "learning_rate": 4.270000000000001e-06, "loss": 0.0006, "num_tokens": 702101.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 2315 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.75, "completions/clipped_ratio": 0.0, "completions/max_length": 39.0, "completions/max_terminated_length": 39.0, "completions/mean_length": 24.75, "completions/mean_terminated_length": 24.75, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 42.888888888888886, "frac_reward_zero_std": 0.0, "grad_norm": 4.073512554168701, "kl": 0.23708392679691315, "learning_rate": 4.269444444444445e-06, "loss": 0.1476, "num_tokens": 702436.0, "reward": 4.5, "reward_std": 2.0, "rewards/reward_combined/mean": 4.5, "rewards/reward_combined/std": 2.0, "step": 2316 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.25, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.25, "completions/mean_terminated_length": 13.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 42.907407407407405, "frac_reward_zero_std": 1.0, "grad_norm": 0.01512904278934002, "kl": 0.002694413182325661, "learning_rate": 4.268888888888889e-06, "loss": 0.0001, "num_tokens": 702713.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2317 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 42.925925925925924, "frac_reward_zero_std": 1.0, "grad_norm": 0.6082221865653992, "kl": 0.07697923947125673, "learning_rate": 4.268333333333334e-06, "loss": 0.0039, "num_tokens": 702995.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2318 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 34.5, "completions/clipped_ratio": 0.0, "completions/max_length": 47.0, "completions/max_terminated_length": 47.0, "completions/mean_length": 34.5, "completions/mean_terminated_length": 34.5, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 42.94444444444444, "frac_reward_zero_std": 0.0, "grad_norm": 3.6864891052246094, "kl": 0.11581708490848541, "learning_rate": 4.267777777777778e-06, "loss": 0.0695, "num_tokens": 703349.0, "reward": 4.625, "reward_std": 2.25, "rewards/reward_combined/mean": 4.625, "rewards/reward_combined/std": 2.25, "step": 2319 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.25, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 18.25, "completions/mean_terminated_length": 18.25, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 42.96296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.07474728673696518, "kl": 0.016612065955996513, "learning_rate": 4.267222222222222e-06, "loss": 0.0008, "num_tokens": 703650.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2320 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 42.98148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.3238447308540344, "kl": 0.27884455025196075, "learning_rate": 4.266666666666668e-06, "loss": 0.0139, "num_tokens": 703952.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2321 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.5, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.5, "completions/mean_terminated_length": 6.5, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 43.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.04881248623132706, "kl": 0.0035973424382973462, "learning_rate": 4.266111111111111e-06, "loss": 0.0002, "num_tokens": 704186.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 2322 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.5, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 21.5, "completions/mean_terminated_length": 21.5, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 43.01851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.194337397813797, "kl": 0.10564601421356201, "learning_rate": 4.265555555555556e-06, "loss": 0.0053, "num_tokens": 704500.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2323 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 43.03703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.0444914884865284, "kl": 0.028583886101841927, "learning_rate": 4.265000000000001e-06, "loss": 0.0014, "num_tokens": 704768.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2324 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.75, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 2.75, "completions/mean_terminated_length": 2.75, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 43.05555555555556, "frac_reward_zero_std": 0.0, "grad_norm": 18.87333106994629, "kl": 0.2255486100912094, "learning_rate": 4.264444444444445e-06, "loss": 0.2365, "num_tokens": 704983.0, "reward": 2.75, "reward_std": 1.5, "rewards/reward_combined/mean": 2.75, "rewards/reward_combined/std": 1.5, "step": 2325 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.75, "completions/clipped_ratio": 0.0, "completions/max_length": 31.0, "completions/max_terminated_length": 31.0, "completions/mean_length": 21.75, "completions/mean_terminated_length": 21.75, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 43.074074074074076, "frac_reward_zero_std": 0.0, "grad_norm": 2.822744846343994, "kl": 0.06959019601345062, "learning_rate": 4.263888888888889e-06, "loss": 0.1114, "num_tokens": 705294.0, "reward": 6.5, "reward_std": 2.0, "rewards/reward_combined/mean": 6.5, "rewards/reward_combined/std": 2.0, "step": 2326 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 43.092592592592595, "frac_reward_zero_std": 1.0, "grad_norm": 0.18278753757476807, "kl": 0.008030198514461517, "learning_rate": 4.263333333333334e-06, "loss": 0.0004, "num_tokens": 705506.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 2327 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.0, "completions/clipped_ratio": 0.0, "completions/max_length": 30.0, "completions/max_terminated_length": 30.0, "completions/mean_length": 23.0, "completions/mean_terminated_length": 23.0, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 43.111111111111114, "frac_reward_zero_std": 0.0, "grad_norm": 5.536439895629883, "kl": 0.04647307936102152, "learning_rate": 4.262777777777778e-06, "loss": 0.0038, "num_tokens": 705846.0, "reward": 2.5, "reward_std": 1.7320507764816284, "rewards/reward_combined/mean": 2.5, "rewards/reward_combined/std": 1.7320507764816284, "step": 2328 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.5, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 9.5, "completions/mean_terminated_length": 9.5, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 43.129629629629626, "frac_reward_zero_std": 1.0, "grad_norm": 0.11574971675872803, "kl": 0.018055476248264313, "learning_rate": 4.2622222222222224e-06, "loss": 0.0009, "num_tokens": 706108.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2329 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.5, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 8.5, "completions/mean_terminated_length": 8.5, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 43.148148148148145, "frac_reward_zero_std": 1.0, "grad_norm": 0.2018791139125824, "kl": 0.016849741339683533, "learning_rate": 4.261666666666668e-06, "loss": 0.0009, "num_tokens": 706354.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 2330 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 43.166666666666664, "frac_reward_zero_std": 0.0, "grad_norm": 2.826795816421509, "kl": 0.01881422195583582, "learning_rate": 4.261111111111111e-06, "loss": 0.0006, "num_tokens": 706646.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 2331 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 43.18518518518518, "frac_reward_zero_std": 0.0, "grad_norm": 7.2493062019348145, "kl": 0.025223426520824432, "learning_rate": 4.260555555555556e-06, "loss": 0.1268, "num_tokens": 706926.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 2332 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 53.0, "completions/clipped_ratio": 0.0, "completions/max_length": 53.0, "completions/max_terminated_length": 53.0, "completions/mean_length": 53.0, "completions/mean_terminated_length": 53.0, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "epoch": 43.2037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.051664963364601135, "kl": 0.10920482873916626, "learning_rate": 4.26e-06, "loss": 0.0055, "num_tokens": 707418.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 2333 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 43.22222222222222, "frac_reward_zero_std": 1.0, "grad_norm": 0.01889081485569477, "kl": 0.03504814952611923, "learning_rate": 4.259444444444445e-06, "loss": 0.0018, "num_tokens": 707702.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2334 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.25, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 11.25, "completions/mean_terminated_length": 11.25, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 43.24074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.5428135395050049, "kl": 0.047476448118686676, "learning_rate": 4.2588888888888894e-06, "loss": 0.0025, "num_tokens": 707967.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2335 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0060240961611270905, "clip_ratio/low_min": 0.0060240961611270905, "clip_ratio/region_mean": 0.0060240961611270905, "completion_length": 40.25, "completions/clipped_ratio": 0.0, "completions/max_length": 52.0, "completions/max_terminated_length": 52.0, "completions/mean_length": 40.25, "completions/mean_terminated_length": 40.25, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 43.25925925925926, "frac_reward_zero_std": 0.0, "grad_norm": 2.8526556491851807, "kl": 0.022490539588034153, "learning_rate": 4.258333333333334e-06, "loss": 0.1938, "num_tokens": 708352.0, "reward": 2.5, "reward_std": 5.830951690673828, "rewards/reward_combined/mean": 2.5, "rewards/reward_combined/std": 5.830951690673828, "step": 2336 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 46.0, "completions/clipped_ratio": 0.0, "completions/max_length": 137.0, "completions/max_terminated_length": 137.0, "completions/mean_length": 46.0, "completions/mean_terminated_length": 46.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 43.27777777777778, "frac_reward_zero_std": 0.0, "grad_norm": 3.396486282348633, "kl": 0.04492104332894087, "learning_rate": 4.257777777777778e-06, "loss": 0.4056, "num_tokens": 708756.0, "reward": 4.800000190734863, "reward_std": 5.738176345825195, "rewards/reward_combined/mean": 4.800000190734863, "rewards/reward_combined/std": 5.738176345825195, "step": 2337 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.5, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 22.5, "completions/mean_terminated_length": 22.5, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 43.2962962962963, "frac_reward_zero_std": 0.0, "grad_norm": 2.762338876724243, "kl": 0.10946623980998993, "learning_rate": 4.2572222222222225e-06, "loss": -0.0636, "num_tokens": 709114.0, "reward": 2.875, "reward_std": 1.25, "rewards/reward_combined/mean": 2.875, "rewards/reward_combined/std": 1.25, "step": 2338 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.0, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 21.0, "completions/mean_terminated_length": 21.0, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 43.31481481481482, "frac_reward_zero_std": 1.0, "grad_norm": 0.06988725066184998, "kl": 0.040194349363446236, "learning_rate": 4.256666666666668e-06, "loss": 0.002, "num_tokens": 709446.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2339 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 43.333333333333336, "frac_reward_zero_std": 1.0, "grad_norm": 0.0032819665502756834, "kl": 0.0018758986261673272, "learning_rate": 4.256111111111111e-06, "loss": 0.0001, "num_tokens": 709726.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2340 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 43.351851851851855, "frac_reward_zero_std": 1.0, "grad_norm": 0.034032292664051056, "kl": 0.0015006095927674323, "learning_rate": 4.255555555555556e-06, "loss": 0.0001, "num_tokens": 709982.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 2341 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.5, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 9.5, "completions/mean_terminated_length": 9.5, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 43.370370370370374, "frac_reward_zero_std": 1.0, "grad_norm": 0.041189175099134445, "kl": 0.0066359578631818295, "learning_rate": 4.255e-06, "loss": 0.0003, "num_tokens": 710240.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2342 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 43.388888888888886, "frac_reward_zero_std": 1.0, "grad_norm": 0.029814844951033592, "kl": 0.008504427969455719, "learning_rate": 4.254444444444445e-06, "loss": 0.0004, "num_tokens": 710552.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 2343 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.75, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 13.75, "completions/mean_terminated_length": 13.75, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 43.407407407407405, "frac_reward_zero_std": 0.0, "grad_norm": 5.22080135345459, "kl": 0.24224267154932022, "learning_rate": 4.2538888888888895e-06, "loss": 0.0692, "num_tokens": 710823.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 2344 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 43.425925925925924, "frac_reward_zero_std": 1.0, "grad_norm": 0.0013021735940128565, "kl": 0.01842240896075964, "learning_rate": 4.253333333333334e-06, "loss": 0.0009, "num_tokens": 711083.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2345 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.75, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 18.75, "completions/mean_terminated_length": 18.75, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 43.44444444444444, "frac_reward_zero_std": 0.0, "grad_norm": 4.190102577209473, "kl": 0.2477792352437973, "learning_rate": 4.252777777777778e-06, "loss": -0.0214, "num_tokens": 711410.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 2346 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 43.46296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.011419286951422691, "kl": 0.03511863946914673, "learning_rate": 4.2522222222222225e-06, "loss": 0.0018, "num_tokens": 711626.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 2347 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.5, "completions/clipped_ratio": 0.0, "completions/max_length": 32.0, "completions/max_terminated_length": 32.0, "completions/mean_length": 22.5, "completions/mean_terminated_length": 22.5, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 43.48148148148148, "frac_reward_zero_std": 0.0, "grad_norm": 3.6609082221984863, "kl": 0.059065740555524826, "learning_rate": 4.251666666666667e-06, "loss": 0.1313, "num_tokens": 711936.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 2348 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.25, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.25, "completions/mean_terminated_length": 11.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 43.5, "frac_reward_zero_std": 1.0, "grad_norm": 0.2227834165096283, "kl": 0.04074045829474926, "learning_rate": 4.251111111111111e-06, "loss": 0.002, "num_tokens": 712193.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2349 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 43.51851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.06264540553092957, "kl": 0.009631678462028503, "learning_rate": 4.2505555555555565e-06, "loss": 0.0005, "num_tokens": 712453.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 2350 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 43.53703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.004143028520047665, "kl": 0.00018942143651656806, "learning_rate": 4.25e-06, "loss": 0.0, "num_tokens": 712733.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2351 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 43.55555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.029927272349596024, "kl": 0.004111191432457417, "learning_rate": 4.249444444444445e-06, "loss": 0.0002, "num_tokens": 712993.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 2352 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 30.0, "completions/clipped_ratio": 0.0, "completions/max_length": 38.0, "completions/max_terminated_length": 38.0, "completions/mean_length": 30.0, "completions/mean_terminated_length": 30.0, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 43.574074074074076, "frac_reward_zero_std": 0.0, "grad_norm": 3.7843124866485596, "kl": 0.1435767412185669, "learning_rate": 4.248888888888889e-06, "loss": -0.0259, "num_tokens": 713329.0, "reward": 5.125, "reward_std": 3.4731109142303467, "rewards/reward_combined/mean": 5.125, "rewards/reward_combined/std": 3.4731109142303467, "step": 2353 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.006493506487458944, "clip_ratio/low_min": 0.006493506487458944, "clip_ratio/region_mean": 0.006493506487458944, "completion_length": 35.0, "completions/clipped_ratio": 0.0, "completions/max_length": 39.0, "completions/max_terminated_length": 39.0, "completions/mean_length": 35.0, "completions/mean_terminated_length": 35.0, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 43.592592592592595, "frac_reward_zero_std": 0.0, "grad_norm": 2.592881441116333, "kl": 0.1022641621530056, "learning_rate": 4.248333333333334e-06, "loss": 0.0839, "num_tokens": 713697.0, "reward": 3.875, "reward_std": 2.688710927963257, "rewards/reward_combined/mean": 3.875, "rewards/reward_combined/std": 2.688710927963257, "step": 2354 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.5, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 16.5, "completions/mean_terminated_length": 16.5, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 43.611111111111114, "frac_reward_zero_std": 1.0, "grad_norm": 0.060661811381578445, "kl": 0.005666135577484965, "learning_rate": 4.247777777777778e-06, "loss": 0.0003, "num_tokens": 714023.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2355 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.5, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.5, "completions/mean_terminated_length": 6.5, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 43.629629629629626, "frac_reward_zero_std": 1.0, "grad_norm": 0.0015543433837592602, "kl": 0.00021465335157699883, "learning_rate": 4.247222222222223e-06, "loss": 0.0, "num_tokens": 714257.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 2356 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 43.648148148148145, "frac_reward_zero_std": 1.0, "grad_norm": 0.037664998322725296, "kl": 0.22643277794122696, "learning_rate": 4.246666666666667e-06, "loss": 0.0113, "num_tokens": 714559.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2357 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 43.666666666666664, "frac_reward_zero_std": 1.0, "grad_norm": 0.00017235943232662976, "kl": 3.901869058609009e-05, "learning_rate": 4.246111111111111e-06, "loss": 0.0, "num_tokens": 714779.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 2358 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 43.68518518518518, "frac_reward_zero_std": 1.0, "grad_norm": 0.053681790828704834, "kl": 0.00873816222883761, "learning_rate": 4.2455555555555565e-06, "loss": 0.0004, "num_tokens": 715063.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2359 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 43.7037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.0019428206142038107, "kl": 0.011076711118221283, "learning_rate": 4.245e-06, "loss": 0.0006, "num_tokens": 715299.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 2360 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.75, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 20.75, "completions/mean_terminated_length": 20.75, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 43.72222222222222, "frac_reward_zero_std": 1.0, "grad_norm": 0.08207514882087708, "kl": 0.01260195707436651, "learning_rate": 4.244444444444445e-06, "loss": 0.0006, "num_tokens": 715626.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2361 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.25, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 16.25, "completions/mean_terminated_length": 16.25, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 43.74074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.06770733743906021, "kl": 0.010428800247609615, "learning_rate": 4.243888888888889e-06, "loss": 0.0005, "num_tokens": 715919.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2362 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 43.75925925925926, "frac_reward_zero_std": 0.0, "grad_norm": 4.385221481323242, "kl": 0.013163601979613304, "learning_rate": 4.243333333333334e-06, "loss": 0.0693, "num_tokens": 716199.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 2363 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 43.77777777777778, "frac_reward_zero_std": 1.0, "grad_norm": 0.033912450075149536, "kl": 0.00477909785695374, "learning_rate": 4.242777777777778e-06, "loss": 0.0003, "num_tokens": 716499.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2364 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 27.25, "completions/clipped_ratio": 0.0, "completions/max_length": 38.0, "completions/max_terminated_length": 38.0, "completions/mean_length": 27.25, "completions/mean_terminated_length": 27.25, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 43.7962962962963, "frac_reward_zero_std": 0.0, "grad_norm": 3.8336949348449707, "kl": 0.059874407947063446, "learning_rate": 4.242222222222223e-06, "loss": 0.1436, "num_tokens": 716836.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 2365 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.25, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 4.25, "completions/mean_terminated_length": 4.25, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 43.81481481481482, "frac_reward_zero_std": 1.0, "grad_norm": 1.514426589012146, "kl": 0.21862135082483292, "learning_rate": 4.241666666666667e-06, "loss": 0.0121, "num_tokens": 717049.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 2366 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 43.833333333333336, "frac_reward_zero_std": 1.0, "grad_norm": 0.07965529710054398, "kl": 0.042401198064908385, "learning_rate": 4.241111111111111e-06, "loss": 0.0021, "num_tokens": 717341.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2367 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.25, "completions/clipped_ratio": 0.0, "completions/max_length": 36.0, "completions/max_terminated_length": 36.0, "completions/mean_length": 25.25, "completions/mean_terminated_length": 25.25, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 43.851851851851855, "frac_reward_zero_std": 0.0, "grad_norm": 3.8531229496002197, "kl": 0.29724863916635513, "learning_rate": 4.240555555555556e-06, "loss": -0.1004, "num_tokens": 717678.0, "reward": 1.75, "reward_std": 2.1794495582580566, "rewards/reward_combined/mean": 1.75, "rewards/reward_combined/std": 2.1794495582580566, "step": 2368 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.5, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 20.5, "completions/mean_terminated_length": 20.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 43.870370370370374, "frac_reward_zero_std": 0.0, "grad_norm": 4.415518283843994, "kl": 0.18813691288232803, "learning_rate": 4.24e-06, "loss": 0.0829, "num_tokens": 717980.0, "reward": 3.5, "reward_std": 4.690415859222412, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 4.690415859222412, "step": 2369 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 43.888888888888886, "frac_reward_zero_std": 1.0, "grad_norm": 0.021491460502147675, "kl": 0.0005691374390153214, "learning_rate": 4.239444444444445e-06, "loss": 0.0, "num_tokens": 718250.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2370 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.5, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 20.5, "completions/mean_terminated_length": 20.5, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 43.907407407407405, "frac_reward_zero_std": 1.0, "grad_norm": 0.09115609526634216, "kl": 0.03880997747182846, "learning_rate": 4.238888888888889e-06, "loss": 0.002, "num_tokens": 718584.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2371 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 5.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 5.0, "completions/mean_terminated_length": 5.0, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 43.925925925925924, "frac_reward_zero_std": 1.0, "grad_norm": 0.0031145629473030567, "kl": 0.0006872713565826416, "learning_rate": 4.238333333333334e-06, "loss": 0.0, "num_tokens": 718804.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 2372 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.5, "completions/clipped_ratio": 0.0, "completions/max_length": 36.0, "completions/max_terminated_length": 36.0, "completions/mean_length": 25.5, "completions/mean_terminated_length": 25.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 43.94444444444444, "frac_reward_zero_std": 0.0, "grad_norm": 2.8676395416259766, "kl": 0.3158711567521095, "learning_rate": 4.2377777777777775e-06, "loss": 0.305, "num_tokens": 719146.0, "reward": 4.5, "reward_std": 4.041451930999756, "rewards/reward_combined/mean": 4.5, "rewards/reward_combined/std": 4.041451930999756, "step": 2373 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.25, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 17.25, "completions/mean_terminated_length": 17.25, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 43.96296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.1587943285703659, "kl": 0.020713869016617537, "learning_rate": 4.237222222222223e-06, "loss": 0.001, "num_tokens": 719455.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2374 }, { "clip_ratio/high_max": 0.013513513840734959, "clip_ratio/high_mean": 0.013513513840734959, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.013513513840734959, "completion_length": 18.5, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 18.5, "completions/mean_terminated_length": 18.5, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 43.98148148148148, "frac_reward_zero_std": 0.0, "grad_norm": 6.826180934906006, "kl": 0.08012931235134602, "learning_rate": 4.236666666666667e-06, "loss": -0.0089, "num_tokens": 719749.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 2375 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 44.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.042767081409692764, "kl": 0.014860046794638038, "learning_rate": 4.236111111111111e-06, "loss": 0.0007, "num_tokens": 720045.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2376 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.0, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 24.0, "completions/mean_terminated_length": 24.0, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 44.01851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.0015532594406977296, "kl": 0.0907326489686966, "learning_rate": 4.235555555555556e-06, "loss": 0.0045, "num_tokens": 720409.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 2377 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 30.25, "completions/clipped_ratio": 0.0, "completions/max_length": 34.0, "completions/max_terminated_length": 34.0, "completions/mean_length": 30.25, "completions/mean_terminated_length": 30.25, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 44.03703703703704, "frac_reward_zero_std": 0.0, "grad_norm": 2.1416923999786377, "kl": 0.1506350040435791, "learning_rate": 4.235e-06, "loss": -0.0184, "num_tokens": 720758.0, "reward": 7.625, "reward_std": 0.25, "rewards/reward_combined/mean": 7.625, "rewards/reward_combined/std": 0.25, "step": 2378 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 44.05555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.0024271667934954166, "kl": 0.010970167815685272, "learning_rate": 4.234444444444445e-06, "loss": 0.0005, "num_tokens": 720994.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 2379 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.0, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 20.0, "completions/mean_terminated_length": 20.0, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 44.074074074074076, "frac_reward_zero_std": 1.0, "grad_norm": 0.2387654036283493, "kl": 0.04702715761959553, "learning_rate": 4.233888888888889e-06, "loss": 0.0024, "num_tokens": 721326.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2380 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.75, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 10.75, "completions/mean_terminated_length": 10.75, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 44.092592592592595, "frac_reward_zero_std": 1.0, "grad_norm": 1.2276571989059448, "kl": 0.1998090765264351, "learning_rate": 4.233333333333334e-06, "loss": 0.0158, "num_tokens": 721577.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 2381 }, { "clip_ratio/high_max": 0.012820512987673283, "clip_ratio/high_mean": 0.012820512987673283, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.012820512987673283, "completion_length": 19.0, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 19.0, "completions/mean_terminated_length": 19.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 44.111111111111114, "frac_reward_zero_std": 0.0, "grad_norm": 4.945544242858887, "kl": 0.41774478554725647, "learning_rate": 4.2327777777777775e-06, "loss": 0.0612, "num_tokens": 721877.0, "reward": 5.875, "reward_std": 3.25, "rewards/reward_combined/mean": 5.875, "rewards/reward_combined/std": 3.25, "step": 2382 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 44.129629629629626, "frac_reward_zero_std": 1.0, "grad_norm": 0.029298564419150352, "kl": 0.0009170472621917725, "learning_rate": 4.232222222222223e-06, "loss": 0.0, "num_tokens": 722147.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2383 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 44.148148148148145, "frac_reward_zero_std": 1.0, "grad_norm": 0.11079012602567673, "kl": 0.05550902523100376, "learning_rate": 4.231666666666667e-06, "loss": 0.0028, "num_tokens": 722443.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2384 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.25, "completions/clipped_ratio": 0.0, "completions/max_length": 28.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 21.25, "completions/mean_terminated_length": 21.25, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 44.166666666666664, "frac_reward_zero_std": 1.0, "grad_norm": 0.4578103721141815, "kl": 0.0663326159119606, "learning_rate": 4.2311111111111114e-06, "loss": 0.0037, "num_tokens": 722780.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2385 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.25, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 10.25, "completions/mean_terminated_length": 10.25, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 44.18518518518518, "frac_reward_zero_std": 1.0, "grad_norm": 0.05430198833346367, "kl": 0.0022178529325174168, "learning_rate": 4.230555555555556e-06, "loss": 0.0001, "num_tokens": 723037.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 2386 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 29.75, "completions/clipped_ratio": 0.0, "completions/max_length": 33.0, "completions/max_terminated_length": 33.0, "completions/mean_length": 29.75, "completions/mean_terminated_length": 29.75, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 44.2037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.12580958008766174, "kl": 0.02891911193728447, "learning_rate": 4.23e-06, "loss": 0.0015, "num_tokens": 723380.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2387 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.75, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 17.75, "completions/mean_terminated_length": 17.75, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 44.22222222222222, "frac_reward_zero_std": 1.0, "grad_norm": 0.032963212579488754, "kl": 0.005026595667004585, "learning_rate": 4.2294444444444445e-06, "loss": 0.0002, "num_tokens": 723679.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2388 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.5, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 9.5, "completions/mean_terminated_length": 9.5, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 44.24074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.030009306967258453, "kl": 0.008858350571244955, "learning_rate": 4.228888888888889e-06, "loss": 0.0004, "num_tokens": 723941.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2389 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.75, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 13.75, "completions/mean_terminated_length": 13.75, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 44.25925925925926, "frac_reward_zero_std": 0.0, "grad_norm": 4.979531764984131, "kl": 0.7163527011871338, "learning_rate": 4.228333333333334e-06, "loss": 0.1073, "num_tokens": 724212.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 2390 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 44.27777777777778, "frac_reward_zero_std": 1.0, "grad_norm": 0.1677066832780838, "kl": 0.03121437132358551, "learning_rate": 4.227777777777778e-06, "loss": 0.0016, "num_tokens": 724416.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 2391 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.0, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 22.0, "completions/mean_terminated_length": 22.0, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 44.2962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.0667540431022644, "kl": 0.025251124054193497, "learning_rate": 4.227222222222223e-06, "loss": 0.0013, "num_tokens": 724732.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2392 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.5, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 18.5, "completions/mean_terminated_length": 18.5, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 44.31481481481482, "frac_reward_zero_std": 1.0, "grad_norm": 0.5652971267700195, "kl": 0.10468707978725433, "learning_rate": 4.226666666666667e-06, "loss": 0.0054, "num_tokens": 725026.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 2393 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.5, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 19.5, "completions/mean_terminated_length": 19.5, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 44.333333333333336, "frac_reward_zero_std": 0.0, "grad_norm": 4.0574188232421875, "kl": 0.0739167109131813, "learning_rate": 4.2261111111111115e-06, "loss": 0.0049, "num_tokens": 725324.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 2394 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 26.5, "completions/clipped_ratio": 0.0, "completions/max_length": 49.0, "completions/max_terminated_length": 49.0, "completions/mean_length": 26.5, "completions/mean_terminated_length": 26.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 44.351851851851855, "frac_reward_zero_std": 0.0, "grad_norm": 5.240999698638916, "kl": 0.16713989526033401, "learning_rate": 4.225555555555556e-06, "loss": 0.0989, "num_tokens": 725670.0, "reward": 4.75, "reward_std": 4.092676162719727, "rewards/reward_combined/mean": 4.75, "rewards/reward_combined/std": 4.092676162719727, "step": 2395 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.25, "completions/clipped_ratio": 0.0, "completions/max_length": 25.0, "completions/max_terminated_length": 25.0, "completions/mean_length": 18.25, "completions/mean_terminated_length": 18.25, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 44.370370370370374, "frac_reward_zero_std": 0.0, "grad_norm": 6.5554304122924805, "kl": 0.09159190766513348, "learning_rate": 4.225e-06, "loss": 0.1118, "num_tokens": 725963.0, "reward": 6.625, "reward_std": 2.0966243743896484, "rewards/reward_combined/mean": 6.625, "rewards/reward_combined/std": 2.0966243743896484, "step": 2396 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.75, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.75, "completions/mean_terminated_length": 14.75, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 44.388888888888886, "frac_reward_zero_std": 1.0, "grad_norm": 0.03083231672644615, "kl": 0.019355669617652893, "learning_rate": 4.2244444444444446e-06, "loss": 0.001, "num_tokens": 726254.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2397 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 44.407407407407405, "frac_reward_zero_std": 1.0, "grad_norm": 0.0001730266521917656, "kl": 3.091245889663696e-05, "learning_rate": 4.223888888888889e-06, "loss": 0.0, "num_tokens": 726474.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 2398 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 44.425925925925924, "frac_reward_zero_std": 1.0, "grad_norm": 0.044893402606248856, "kl": 0.003032621636521071, "learning_rate": 4.223333333333334e-06, "loss": 0.0001, "num_tokens": 726752.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2399 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.25, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 22.25, "completions/mean_terminated_length": 22.25, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 44.44444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.05910126864910126, "kl": 0.010152839589864016, "learning_rate": 4.222777777777778e-06, "loss": 0.0005, "num_tokens": 727085.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2400 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.0, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 20.0, "completions/mean_terminated_length": 20.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 44.46296296296296, "frac_reward_zero_std": 0.0, "grad_norm": 2.9350624084472656, "kl": 0.08894626423716545, "learning_rate": 4.222222222222223e-06, "loss": -0.0474, "num_tokens": 727393.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 2401 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.25, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.25, "completions/mean_terminated_length": 13.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 44.48148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.023116057738661766, "kl": 0.0038778134621679783, "learning_rate": 4.221666666666667e-06, "loss": 0.0002, "num_tokens": 727670.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2402 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 44.5, "frac_reward_zero_std": 1.0, "grad_norm": 0.0012879632413387299, "kl": 0.018417646177113056, "learning_rate": 4.2211111111111115e-06, "loss": 0.0009, "num_tokens": 727930.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2403 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 44.51851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.0074869669042527676, "kl": 0.0023431479930877686, "learning_rate": 4.220555555555556e-06, "loss": 0.0001, "num_tokens": 728142.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 2404 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 44.53703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.02658437006175518, "kl": 0.007753283716738224, "learning_rate": 4.22e-06, "loss": 0.0004, "num_tokens": 728454.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 2405 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 44.55555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.004090797621756792, "kl": 0.00032429397106170654, "learning_rate": 4.219444444444445e-06, "loss": 0.0, "num_tokens": 728698.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 2406 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 44.574074074074076, "frac_reward_zero_std": 1.0, "grad_norm": 0.009891826659440994, "kl": 0.0013118580682203174, "learning_rate": 4.218888888888889e-06, "loss": 0.0001, "num_tokens": 729016.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2407 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.5, "completions/clipped_ratio": 0.0, "completions/max_length": 25.0, "completions/max_terminated_length": 25.0, "completions/mean_length": 20.5, "completions/mean_terminated_length": 20.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 44.592592592592595, "frac_reward_zero_std": 1.0, "grad_norm": 0.09824906289577484, "kl": 0.03530384786427021, "learning_rate": 4.218333333333333e-06, "loss": 0.0017, "num_tokens": 729346.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2408 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 44.611111111111114, "frac_reward_zero_std": 1.0, "grad_norm": 0.03608279675245285, "kl": 0.21387039124965668, "learning_rate": 4.217777777777778e-06, "loss": 0.0107, "num_tokens": 729650.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2409 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 43.25, "completions/clipped_ratio": 0.0, "completions/max_length": 53.0, "completions/max_terminated_length": 53.0, "completions/mean_length": 43.25, "completions/mean_terminated_length": 43.25, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 44.629629629629626, "frac_reward_zero_std": 1.0, "grad_norm": 0.18216684460639954, "kl": 0.11225807666778564, "learning_rate": 4.217222222222223e-06, "loss": 0.0055, "num_tokens": 730103.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 2410 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.014705882407724857, "clip_ratio/low_min": 0.014705882407724857, "clip_ratio/region_mean": 0.014705882407724857, "completion_length": 15.5, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 44.648148148148145, "frac_reward_zero_std": 0.0, "grad_norm": 10.171411514282227, "kl": 0.03336090501397848, "learning_rate": 4.216666666666667e-06, "loss": 0.2491, "num_tokens": 730385.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 2411 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 44.666666666666664, "frac_reward_zero_std": 1.0, "grad_norm": 0.7178643345832825, "kl": 0.12677130848169327, "learning_rate": 4.216111111111112e-06, "loss": 0.0067, "num_tokens": 730655.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2412 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 44.68518518518518, "frac_reward_zero_std": 1.0, "grad_norm": 0.010446781292557716, "kl": 0.0009326284634880722, "learning_rate": 4.215555555555556e-06, "loss": 0.0, "num_tokens": 730951.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2413 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.022727273404598236, "clip_ratio/low_min": 0.022727273404598236, "clip_ratio/region_mean": 0.022727273404598236, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 44.7037037037037, "frac_reward_zero_std": 0.0, "grad_norm": 2.587354898452759, "kl": 0.02362685650587082, "learning_rate": 4.215e-06, "loss": -0.011, "num_tokens": 731225.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 2414 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.25, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 15.25, "completions/mean_terminated_length": 15.25, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 44.72222222222222, "frac_reward_zero_std": 1.0, "grad_norm": 0.05528799816966057, "kl": 0.0032499507069587708, "learning_rate": 4.214444444444445e-06, "loss": 0.0002, "num_tokens": 731498.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2415 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 18.0, "completions/mean_terminated_length": 18.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 44.74074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.042337942868471146, "kl": 0.004812851431779563, "learning_rate": 4.213888888888889e-06, "loss": 0.0002, "num_tokens": 731810.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2416 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 44.75925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.013658326119184494, "kl": 0.03383994102478027, "learning_rate": 4.213333333333333e-06, "loss": 0.0017, "num_tokens": 732026.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 2417 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 44.77777777777778, "frac_reward_zero_std": 1.0, "grad_norm": 0.014588520862162113, "kl": 0.035614825785160065, "learning_rate": 4.212777777777778e-06, "loss": 0.0018, "num_tokens": 732310.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2418 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 42.25, "completions/clipped_ratio": 0.0, "completions/max_length": 56.0, "completions/max_terminated_length": 56.0, "completions/mean_length": 42.25, "completions/mean_terminated_length": 42.25, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 44.7962962962963, "frac_reward_zero_std": 0.0, "grad_norm": 4.032966136932373, "kl": 0.10868346691131592, "learning_rate": 4.212222222222223e-06, "loss": 0.117, "num_tokens": 732715.0, "reward": 2.75, "reward_std": 1.1902379989624023, "rewards/reward_combined/mean": 2.75, "rewards/reward_combined/std": 1.190238118171692, "step": 2419 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 44.81481481481482, "frac_reward_zero_std": 1.0, "grad_norm": 0.034715063869953156, "kl": 0.003978398395702243, "learning_rate": 4.211666666666667e-06, "loss": 0.0002, "num_tokens": 732975.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 2420 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 44.833333333333336, "frac_reward_zero_std": 1.0, "grad_norm": 0.009182914160192013, "kl": 0.00029093027114868164, "learning_rate": 4.211111111111112e-06, "loss": 0.0, "num_tokens": 733187.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 2421 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 44.851851851851855, "frac_reward_zero_std": 1.0, "grad_norm": 0.06008267030119896, "kl": 0.006870675832033157, "learning_rate": 4.210555555555556e-06, "loss": 0.0003, "num_tokens": 733447.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 2422 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.25, "completions/clipped_ratio": 0.0, "completions/max_length": 32.0, "completions/max_terminated_length": 32.0, "completions/mean_length": 24.25, "completions/mean_terminated_length": 24.25, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 44.870370370370374, "frac_reward_zero_std": 1.0, "grad_norm": 0.1444837599992752, "kl": 0.03262383909896016, "learning_rate": 4.21e-06, "loss": 0.0016, "num_tokens": 733792.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 2423 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 44.888888888888886, "frac_reward_zero_std": 1.0, "grad_norm": 0.0531405434012413, "kl": 0.008239237125962973, "learning_rate": 4.209444444444445e-06, "loss": 0.0004, "num_tokens": 734076.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2424 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 44.907407407407405, "frac_reward_zero_std": 1.0, "grad_norm": 0.06277952343225479, "kl": 0.005411464720964432, "learning_rate": 4.208888888888889e-06, "loss": 0.0003, "num_tokens": 734332.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2425 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.75, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 4.75, "completions/mean_terminated_length": 4.75, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 44.925925925925924, "frac_reward_zero_std": 1.0, "grad_norm": 0.02592477761209011, "kl": 0.0015998855524230748, "learning_rate": 4.208333333333333e-06, "loss": 0.0001, "num_tokens": 734551.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 2426 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 44.94444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.7583529949188232, "kl": 0.13736892584711313, "learning_rate": 4.207777777777778e-06, "loss": 0.0062, "num_tokens": 734851.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2427 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 38.0, "completions/clipped_ratio": 0.0, "completions/max_length": 53.0, "completions/max_terminated_length": 53.0, "completions/mean_length": 38.0, "completions/mean_terminated_length": 38.0, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 44.96296296296296, "frac_reward_zero_std": 0.0, "grad_norm": 3.2786293029785156, "kl": 0.06676328554749489, "learning_rate": 4.207222222222222e-06, "loss": 0.1055, "num_tokens": 735219.0, "reward": 2.375, "reward_std": 2.25, "rewards/reward_combined/mean": 2.375, "rewards/reward_combined/std": 2.25, "step": 2428 }, { "clip_ratio/high_max": 0.006666666828095913, "clip_ratio/high_mean": 0.006666666828095913, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.006666666828095913, "completion_length": 29.25, "completions/clipped_ratio": 0.0, "completions/max_length": 53.0, "completions/max_terminated_length": 53.0, "completions/mean_length": 29.25, "completions/mean_terminated_length": 29.25, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 44.98148148148148, "frac_reward_zero_std": 0.0, "grad_norm": 2.3669283390045166, "kl": 0.05728482827544212, "learning_rate": 4.206666666666667e-06, "loss": -0.2052, "num_tokens": 735556.0, "reward": 5.25, "reward_std": 4.5, "rewards/reward_combined/mean": 5.25, "rewards/reward_combined/std": 4.5, "step": 2429 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 45.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.12042137235403061, "kl": 0.027300288900732994, "learning_rate": 4.206111111111112e-06, "loss": 0.0014, "num_tokens": 735832.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2430 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 45.01851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.060343094170093536, "kl": 0.030382078140974045, "learning_rate": 4.205555555555556e-06, "loss": 0.0015, "num_tokens": 736100.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2431 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.0, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 20.0, "completions/mean_terminated_length": 20.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 45.03703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.06546079367399216, "kl": 0.021628119982779026, "learning_rate": 4.205e-06, "loss": 0.0011, "num_tokens": 736432.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2432 }, { "clip_ratio/high_max": 0.04545454680919647, "clip_ratio/high_mean": 0.04545454680919647, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.04545454680919647, "completion_length": 4.75, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 4.75, "completions/mean_terminated_length": 4.75, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 45.05555555555556, "frac_reward_zero_std": 0.0, "grad_norm": 6.302450180053711, "kl": 0.2419438660144806, "learning_rate": 4.204444444444445e-06, "loss": -0.1187, "num_tokens": 736651.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 2433 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 45.074074074074076, "frac_reward_zero_std": 1.0, "grad_norm": 0.0119628282263875, "kl": 0.0013003426138311625, "learning_rate": 4.203888888888889e-06, "loss": 0.0001, "num_tokens": 736971.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2434 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 45.092592592592595, "frac_reward_zero_std": 1.0, "grad_norm": 0.06982363015413284, "kl": 0.009406683500856161, "learning_rate": 4.2033333333333335e-06, "loss": 0.0005, "num_tokens": 737251.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2435 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 45.111111111111114, "frac_reward_zero_std": 1.0, "grad_norm": 0.007960902526974678, "kl": 0.0017013400793075562, "learning_rate": 4.202777777777778e-06, "loss": 0.0001, "num_tokens": 737463.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 2436 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 28.0, "completions/clipped_ratio": 0.0, "completions/max_length": 34.0, "completions/max_terminated_length": 34.0, "completions/mean_length": 28.0, "completions/mean_terminated_length": 28.0, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 45.129629629629626, "frac_reward_zero_std": 0.0, "grad_norm": 3.419459819793701, "kl": 0.06600903533399105, "learning_rate": 4.202222222222222e-06, "loss": 0.0623, "num_tokens": 737791.0, "reward": 3.5, "reward_std": 3.674234628677368, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 3.674234628677368, "step": 2437 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 45.148148148148145, "frac_reward_zero_std": 0.0, "grad_norm": 9.33138370513916, "kl": 0.051925841718912125, "learning_rate": 4.201666666666667e-06, "loss": -0.0134, "num_tokens": 738065.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 2438 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 45.166666666666664, "frac_reward_zero_std": 1.0, "grad_norm": 0.002324198605492711, "kl": 0.0108872652053833, "learning_rate": 4.201111111111112e-06, "loss": 0.0005, "num_tokens": 738301.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 2439 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.25, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 21.25, "completions/mean_terminated_length": 21.25, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 45.18518518518518, "frac_reward_zero_std": 1.0, "grad_norm": 0.21667353808879852, "kl": 0.11198345571756363, "learning_rate": 4.200555555555556e-06, "loss": 0.0056, "num_tokens": 738614.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2440 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 17.0, "completions/mean_terminated_length": 17.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 45.2037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.3145764172077179, "kl": 0.04930100042838603, "learning_rate": 4.2000000000000004e-06, "loss": 0.0025, "num_tokens": 738922.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2441 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.25, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 8.25, "completions/mean_terminated_length": 8.25, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 45.22222222222222, "frac_reward_zero_std": 0.0, "grad_norm": 4.819115161895752, "kl": 0.03072873316705227, "learning_rate": 4.199444444444445e-06, "loss": 0.0358, "num_tokens": 739183.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 2442 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 45.24074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.12047115713357925, "kl": 0.03146929666399956, "learning_rate": 4.198888888888889e-06, "loss": 0.0016, "num_tokens": 739475.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2443 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 45.25925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.0486835353076458, "kl": 0.004377327859401703, "learning_rate": 4.1983333333333335e-06, "loss": 0.0002, "num_tokens": 739719.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 2444 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 25.0, "completions/max_terminated_length": 25.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 45.27777777777778, "frac_reward_zero_std": 1.0, "grad_norm": 0.19269941747188568, "kl": 0.03672148194164038, "learning_rate": 4.197777777777779e-06, "loss": 0.0022, "num_tokens": 739997.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2445 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 10.5, "completions/mean_terminated_length": 10.5, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 45.2962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.038023691624403, "kl": 0.004960166988894343, "learning_rate": 4.197222222222222e-06, "loss": 0.0002, "num_tokens": 740263.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2446 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 33.75, "completions/clipped_ratio": 0.0, "completions/max_length": 40.0, "completions/max_terminated_length": 40.0, "completions/mean_length": 33.75, "completions/mean_terminated_length": 33.75, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 45.31481481481482, "frac_reward_zero_std": 0.0, "grad_norm": 2.496164560317993, "kl": 0.05106687359511852, "learning_rate": 4.1966666666666674e-06, "loss": -0.1648, "num_tokens": 740626.0, "reward": 7.875, "reward_std": 0.25, "rewards/reward_combined/mean": 7.875, "rewards/reward_combined/std": 0.25, "step": 2447 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.5, "completions/clipped_ratio": 0.0, "completions/max_length": 30.0, "completions/max_terminated_length": 30.0, "completions/mean_length": 25.5, "completions/mean_terminated_length": 25.5, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 45.333333333333336, "frac_reward_zero_std": 1.0, "grad_norm": 0.47000911831855774, "kl": 0.08121144585311413, "learning_rate": 4.196111111111111e-06, "loss": 0.004, "num_tokens": 740976.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 2448 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 45.351851851851855, "frac_reward_zero_std": 1.0, "grad_norm": 0.017635991796851158, "kl": 0.0008914843201637268, "learning_rate": 4.195555555555556e-06, "loss": 0.0, "num_tokens": 741188.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 2449 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 45.370370370370374, "frac_reward_zero_std": 1.0, "grad_norm": 0.0031003649346530437, "kl": 0.017946289852261543, "learning_rate": 4.1950000000000005e-06, "loss": 0.0009, "num_tokens": 741448.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2450 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.0, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 21.0, "completions/mean_terminated_length": 21.0, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 45.388888888888886, "frac_reward_zero_std": 1.0, "grad_norm": 0.03409595414996147, "kl": 0.007072507869452238, "learning_rate": 4.194444444444445e-06, "loss": 0.0003, "num_tokens": 741776.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2451 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.5, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 19.5, "completions/mean_terminated_length": 19.5, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 45.407407407407405, "frac_reward_zero_std": 0.0, "grad_norm": 3.1196634769439697, "kl": 0.06070644222199917, "learning_rate": 4.193888888888889e-06, "loss": 0.0325, "num_tokens": 742074.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 2452 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.5, "completions/clipped_ratio": 0.0, "completions/max_length": 36.0, "completions/max_terminated_length": 36.0, "completions/mean_length": 24.5, "completions/mean_terminated_length": 24.5, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 45.425925925925924, "frac_reward_zero_std": 0.0, "grad_norm": 4.8708815574646, "kl": 0.15868808329105377, "learning_rate": 4.1933333333333336e-06, "loss": -0.0534, "num_tokens": 742396.0, "reward": 4.625, "reward_std": 3.4731109142303467, "rewards/reward_combined/mean": 4.625, "rewards/reward_combined/std": 3.4731109142303467, "step": 2453 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.5, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 17.5, "completions/mean_terminated_length": 17.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 45.44444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.0782703086733818, "kl": 0.14037798717617989, "learning_rate": 4.192777777777779e-06, "loss": 0.0069, "num_tokens": 742706.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2454 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.75, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 17.75, "completions/mean_terminated_length": 17.75, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 45.46296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.2675703167915344, "kl": 0.054110296070575714, "learning_rate": 4.192222222222222e-06, "loss": 0.0027, "num_tokens": 742997.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 2455 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 45.48148148148148, "frac_reward_zero_std": 0.0, "grad_norm": 2.0499157905578613, "kl": 0.026089471764862537, "learning_rate": 4.1916666666666675e-06, "loss": 0.0011, "num_tokens": 743281.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 2456 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.5, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.5, "completions/mean_terminated_length": 6.5, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 45.5, "frac_reward_zero_std": 1.0, "grad_norm": 0.017901059240102768, "kl": 0.0007845120853744447, "learning_rate": 4.191111111111111e-06, "loss": 0.0, "num_tokens": 743515.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 2457 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.5, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 2.5, "completions/mean_terminated_length": 2.5, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 45.51851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.14225217700004578, "kl": 0.017775557935237885, "learning_rate": 4.190555555555556e-06, "loss": 0.0008, "num_tokens": 743721.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 2458 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.75, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.75, "completions/mean_terminated_length": 15.75, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 45.53703703703704, "frac_reward_zero_std": 0.0, "grad_norm": 1.405412197113037, "kl": 0.4010024890303612, "learning_rate": 4.1900000000000005e-06, "loss": 0.0379, "num_tokens": 744024.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 2459 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.75, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.75, "completions/mean_terminated_length": 14.75, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 45.55555555555556, "frac_reward_zero_std": 0.0, "grad_norm": 1.5373642444610596, "kl": 0.02545546949841082, "learning_rate": 4.189444444444445e-06, "loss": -0.0169, "num_tokens": 744295.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 2460 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 45.574074074074076, "frac_reward_zero_std": 1.0, "grad_norm": 0.0593186654150486, "kl": 0.009029599255882204, "learning_rate": 4.188888888888889e-06, "loss": 0.0004, "num_tokens": 744551.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2461 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 45.592592592592595, "frac_reward_zero_std": 1.0, "grad_norm": 0.028445923700928688, "kl": 0.007651487365365028, "learning_rate": 4.188333333333334e-06, "loss": 0.0004, "num_tokens": 744863.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 2462 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.75, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 16.75, "completions/mean_terminated_length": 16.75, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 45.611111111111114, "frac_reward_zero_std": 0.0, "grad_norm": 2.6674132347106934, "kl": 0.051194521598517895, "learning_rate": 4.187777777777778e-06, "loss": 0.0152, "num_tokens": 745154.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 2463 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.25, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.25, "completions/mean_terminated_length": 13.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 45.629629629629626, "frac_reward_zero_std": 1.0, "grad_norm": 0.02745758183300495, "kl": 0.004451077897101641, "learning_rate": 4.187222222222222e-06, "loss": 0.0002, "num_tokens": 745431.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2464 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.25, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.25, "completions/mean_terminated_length": 14.25, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 45.648148148148145, "frac_reward_zero_std": 1.0, "grad_norm": 0.04508591443300247, "kl": 0.05905391648411751, "learning_rate": 4.1866666666666675e-06, "loss": 0.003, "num_tokens": 745720.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2465 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 28.5, "completions/clipped_ratio": 0.0, "completions/max_length": 32.0, "completions/max_terminated_length": 32.0, "completions/mean_length": 28.5, "completions/mean_terminated_length": 28.5, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 45.666666666666664, "frac_reward_zero_std": 0.0, "grad_norm": 2.8913016319274902, "kl": 0.08984589949250221, "learning_rate": 4.186111111111111e-06, "loss": 0.022, "num_tokens": 746070.0, "reward": 4.5, "reward_std": 2.0, "rewards/reward_combined/mean": 4.5, "rewards/reward_combined/std": 2.0, "step": 2466 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 45.68518518518518, "frac_reward_zero_std": 1.0, "grad_norm": 0.032928649336099625, "kl": 0.03170877322554588, "learning_rate": 4.185555555555556e-06, "loss": 0.0016, "num_tokens": 746354.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2467 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 45.7037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.010137248784303665, "kl": 0.000600174069404602, "learning_rate": 4.185000000000001e-06, "loss": 0.0, "num_tokens": 746626.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2468 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.75, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 19.75, "completions/mean_terminated_length": 19.75, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 45.72222222222222, "frac_reward_zero_std": 1.0, "grad_norm": 0.1251058280467987, "kl": 0.11370474100112915, "learning_rate": 4.184444444444445e-06, "loss": 0.0052, "num_tokens": 746957.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2469 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 36.25, "completions/clipped_ratio": 0.0, "completions/max_length": 50.0, "completions/max_terminated_length": 50.0, "completions/mean_length": 36.25, "completions/mean_terminated_length": 36.25, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 45.74074074074074, "frac_reward_zero_std": 0.0, "grad_norm": 2.830164670944214, "kl": 0.4143544677644968, "learning_rate": 4.183888888888889e-06, "loss": 0.2338, "num_tokens": 747322.0, "reward": 5.0, "reward_std": 5.0, "rewards/reward_combined/mean": 5.0, "rewards/reward_combined/std": 5.0, "step": 2470 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 42.25, "completions/clipped_ratio": 0.0, "completions/max_length": 130.0, "completions/max_terminated_length": 130.0, "completions/mean_length": 42.25, "completions/mean_terminated_length": 42.25, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 45.75925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.12412302196025848, "kl": 0.023902318440377712, "learning_rate": 4.183333333333334e-06, "loss": 0.0017, "num_tokens": 747711.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2471 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.0, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 24.0, "completions/mean_terminated_length": 24.0, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 45.77777777777778, "frac_reward_zero_std": 1.0, "grad_norm": 0.002705648075789213, "kl": 0.09041459113359451, "learning_rate": 4.182777777777778e-06, "loss": 0.0045, "num_tokens": 748075.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 2472 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 45.7962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.30386969447135925, "kl": 0.02075161933316849, "learning_rate": 4.182222222222222e-06, "loss": 0.001, "num_tokens": 748331.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 2473 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 45.81481481481482, "frac_reward_zero_std": 1.0, "grad_norm": 0.00016489846166223288, "kl": 3.8236379623413086e-05, "learning_rate": 4.1816666666666676e-06, "loss": 0.0, "num_tokens": 748551.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 2474 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.75, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 4.75, "completions/mean_terminated_length": 4.75, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 45.833333333333336, "frac_reward_zero_std": 1.0, "grad_norm": 0.046400394290685654, "kl": 0.0026277839206159115, "learning_rate": 4.181111111111111e-06, "loss": 0.0001, "num_tokens": 748770.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 2475 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.75, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 12.75, "completions/mean_terminated_length": 12.75, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 45.851851851851855, "frac_reward_zero_std": 1.0, "grad_norm": 0.025369850918650627, "kl": 0.0721223782747984, "learning_rate": 4.180555555555556e-06, "loss": 0.0035, "num_tokens": 749037.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2476 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 45.870370370370374, "frac_reward_zero_std": 1.0, "grad_norm": 0.04950457811355591, "kl": 0.03028121031820774, "learning_rate": 4.18e-06, "loss": 0.0015, "num_tokens": 749335.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2477 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.75, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 19.75, "completions/mean_terminated_length": 19.75, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 45.888888888888886, "frac_reward_zero_std": 0.0, "grad_norm": 2.3147785663604736, "kl": 0.05423046089708805, "learning_rate": 4.179444444444445e-06, "loss": 0.0036, "num_tokens": 749662.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 2478 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 45.907407407407405, "frac_reward_zero_std": 1.0, "grad_norm": 0.011583048850297928, "kl": 0.0015748313162475824, "learning_rate": 4.178888888888889e-06, "loss": 0.0001, "num_tokens": 749942.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2479 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 33.75, "completions/clipped_ratio": 0.0, "completions/max_length": 39.0, "completions/max_terminated_length": 39.0, "completions/mean_length": 33.75, "completions/mean_terminated_length": 33.75, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 45.925925925925924, "frac_reward_zero_std": 1.0, "grad_norm": 0.2840295732021332, "kl": 0.04918257612735033, "learning_rate": 4.178333333333334e-06, "loss": 0.0026, "num_tokens": 750301.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2480 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 26.5, "completions/clipped_ratio": 0.0, "completions/max_length": 35.0, "completions/max_terminated_length": 35.0, "completions/mean_length": 26.5, "completions/mean_terminated_length": 26.5, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 45.94444444444444, "frac_reward_zero_std": 0.0, "grad_norm": 3.1616344451904297, "kl": 0.04210103861987591, "learning_rate": 4.177777777777778e-06, "loss": 0.0248, "num_tokens": 750635.0, "reward": 5.0, "reward_std": 3.5590262413024902, "rewards/reward_combined/mean": 5.0, "rewards/reward_combined/std": 3.5590262413024902, "step": 2481 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.75, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 17.75, "completions/mean_terminated_length": 17.75, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 45.96296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.035784389823675156, "kl": 0.00280473823659122, "learning_rate": 4.177222222222222e-06, "loss": 0.0001, "num_tokens": 750946.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2482 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 42.0, "completions/max_terminated_length": 42.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 45.98148148148148, "frac_reward_zero_std": 0.0, "grad_norm": 8.614908218383789, "kl": 0.1255622454918921, "learning_rate": 4.176666666666668e-06, "loss": 0.3117, "num_tokens": 751226.0, "reward": 3.0, "reward_std": 1.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 1.0, "step": 2483 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 55.25, "completions/clipped_ratio": 0.0, "completions/max_length": 87.0, "completions/max_terminated_length": 87.0, "completions/mean_length": 55.25, "completions/mean_terminated_length": 55.25, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "epoch": 46.0, "frac_reward_zero_std": 0.0, "grad_norm": 8.51135540008545, "kl": 1.1117050983011723, "learning_rate": 4.176111111111111e-06, "loss": 0.0301, "num_tokens": 751727.0, "reward": 1.5499999523162842, "reward_std": 1.2556538581848145, "rewards/reward_combined/mean": 1.5499999523162842, "rewards/reward_combined/std": 1.2556538581848145, "step": 2484 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 46.01851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.0001718157291179523, "kl": 3.4339725971221924e-05, "learning_rate": 4.175555555555556e-06, "loss": 0.0, "num_tokens": 751947.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 2485 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 46.03703703703704, "frac_reward_zero_std": 0.0, "grad_norm": 3.794783353805542, "kl": 0.008702837978489697, "learning_rate": 4.175e-06, "loss": 0.0941, "num_tokens": 752215.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 2486 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.25, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 15.25, "completions/mean_terminated_length": 15.25, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 46.05555555555556, "frac_reward_zero_std": 0.0, "grad_norm": 4.020786285400391, "kl": 0.28023721277713776, "learning_rate": 4.174444444444445e-06, "loss": -0.0051, "num_tokens": 752516.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 2487 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.5, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 18.5, "completions/mean_terminated_length": 18.5, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 46.074074074074076, "frac_reward_zero_std": 1.0, "grad_norm": 0.04178664833307266, "kl": 0.010704312473535538, "learning_rate": 4.173888888888889e-06, "loss": 0.0005, "num_tokens": 752818.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2488 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 46.092592592592595, "frac_reward_zero_std": 1.0, "grad_norm": 0.011140561662614346, "kl": 0.002436425071209669, "learning_rate": 4.173333333333334e-06, "loss": 0.0001, "num_tokens": 753098.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2489 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 46.111111111111114, "frac_reward_zero_std": 1.0, "grad_norm": 0.3538872003555298, "kl": 0.06412235274910927, "learning_rate": 4.172777777777778e-06, "loss": 0.0032, "num_tokens": 753386.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2490 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.5, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 2.5, "completions/mean_terminated_length": 2.5, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 46.129629629629626, "frac_reward_zero_std": 1.0, "grad_norm": 0.10016913712024689, "kl": 0.0075552985072135925, "learning_rate": 4.1722222222222225e-06, "loss": 0.0004, "num_tokens": 753592.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 2491 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.0, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 21.0, "completions/mean_terminated_length": 21.0, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 46.148148148148145, "frac_reward_zero_std": 1.0, "grad_norm": 0.03629341721534729, "kl": 0.03254210017621517, "learning_rate": 4.171666666666667e-06, "loss": 0.0016, "num_tokens": 753924.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2492 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 46.166666666666664, "frac_reward_zero_std": 1.0, "grad_norm": 0.003664012998342514, "kl": 0.017853411845862865, "learning_rate": 4.171111111111111e-06, "loss": 0.0009, "num_tokens": 754184.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2493 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 17.0, "completions/mean_terminated_length": 17.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 46.18518518518518, "frac_reward_zero_std": 1.0, "grad_norm": 0.03857041150331497, "kl": 0.1649976149201393, "learning_rate": 4.170555555555556e-06, "loss": 0.0082, "num_tokens": 754492.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2494 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 45.75, "completions/clipped_ratio": 0.0, "completions/max_length": 85.0, "completions/max_terminated_length": 85.0, "completions/mean_length": 45.75, "completions/mean_terminated_length": 45.75, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 46.2037037037037, "frac_reward_zero_std": 0.0, "grad_norm": 2.6652421951293945, "kl": 0.07999678328633308, "learning_rate": 4.17e-06, "loss": 0.1227, "num_tokens": 754891.0, "reward": 3.5, "reward_std": 3.188521146774292, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 3.188521146774292, "step": 2495 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 46.22222222222222, "frac_reward_zero_std": 1.0, "grad_norm": 0.2696685492992401, "kl": 0.04207994043827057, "learning_rate": 4.169444444444445e-06, "loss": 0.0021, "num_tokens": 755165.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2496 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 7.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 7.0, "completions/mean_terminated_length": 7.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 46.24074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.13787785172462463, "kl": 0.01912989467382431, "learning_rate": 4.168888888888889e-06, "loss": 0.0006, "num_tokens": 755413.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 2497 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.0, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 22.0, "completions/mean_terminated_length": 22.0, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 46.25925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.13217243552207947, "kl": 0.024520167149603367, "learning_rate": 4.168333333333334e-06, "loss": 0.0012, "num_tokens": 755729.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2498 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 46.27777777777778, "frac_reward_zero_std": 1.0, "grad_norm": 0.185525581240654, "kl": 0.02174455299973488, "learning_rate": 4.167777777777778e-06, "loss": 0.0011, "num_tokens": 755973.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 2499 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 46.2962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.007135570980608463, "kl": 0.009852640330791473, "learning_rate": 4.1672222222222225e-06, "loss": 0.0005, "num_tokens": 756209.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 2500 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.25, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 12.25, "completions/mean_terminated_length": 12.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 46.31481481481482, "frac_reward_zero_std": 1.0, "grad_norm": 0.2352469116449356, "kl": 0.14810563065111637, "learning_rate": 4.166666666666667e-06, "loss": 0.0072, "num_tokens": 756474.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2501 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 46.333333333333336, "frac_reward_zero_std": 1.0, "grad_norm": 0.03614146634936333, "kl": 0.027624770998954773, "learning_rate": 4.166111111111111e-06, "loss": 0.0014, "num_tokens": 756742.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2502 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.0, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 21.0, "completions/mean_terminated_length": 21.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 46.351851851851855, "frac_reward_zero_std": 1.0, "grad_norm": 0.1282159984111786, "kl": 0.10515665262937546, "learning_rate": 4.1655555555555564e-06, "loss": 0.0053, "num_tokens": 757050.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2503 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.75, "completions/clipped_ratio": 0.0, "completions/max_length": 31.0, "completions/max_terminated_length": 31.0, "completions/mean_length": 22.75, "completions/mean_terminated_length": 22.75, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 46.370370370370374, "frac_reward_zero_std": 1.0, "grad_norm": 0.08585701137781143, "kl": 0.07519544661045074, "learning_rate": 4.165e-06, "loss": 0.0036, "num_tokens": 757369.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2504 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.25, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 15.25, "completions/mean_terminated_length": 15.25, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 46.388888888888886, "frac_reward_zero_std": 1.0, "grad_norm": 0.10636861622333527, "kl": 0.01869631139561534, "learning_rate": 4.164444444444445e-06, "loss": 0.0009, "num_tokens": 757642.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2505 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 46.407407407407405, "frac_reward_zero_std": 1.0, "grad_norm": 0.07705557346343994, "kl": 0.006847345008281991, "learning_rate": 4.163888888888889e-06, "loss": 0.0004, "num_tokens": 757940.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2506 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 31.25, "completions/clipped_ratio": 0.0, "completions/max_length": 38.0, "completions/max_terminated_length": 38.0, "completions/mean_length": 31.25, "completions/mean_terminated_length": 31.25, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 46.425925925925924, "frac_reward_zero_std": 0.0, "grad_norm": 2.143014907836914, "kl": 0.03649156913161278, "learning_rate": 4.163333333333334e-06, "loss": 0.0812, "num_tokens": 758317.0, "reward": 4.625, "reward_std": 3.4731109142303467, "rewards/reward_combined/mean": 4.625, "rewards/reward_combined/std": 3.4731109142303467, "step": 2507 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.0, "completions/clipped_ratio": 0.0, "completions/max_length": 37.0, "completions/max_terminated_length": 37.0, "completions/mean_length": 25.0, "completions/mean_terminated_length": 25.0, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 46.44444444444444, "frac_reward_zero_std": 0.0, "grad_norm": 1.8084145784378052, "kl": 0.07173210475593805, "learning_rate": 4.162777777777778e-06, "loss": -0.1586, "num_tokens": 758637.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 2508 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 34.25, "completions/clipped_ratio": 0.0, "completions/max_length": 40.0, "completions/max_terminated_length": 40.0, "completions/mean_length": 34.25, "completions/mean_terminated_length": 34.25, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 46.46296296296296, "frac_reward_zero_std": 0.0, "grad_norm": 2.95266056060791, "kl": 0.12117773666977882, "learning_rate": 4.1622222222222226e-06, "loss": 0.0489, "num_tokens": 759002.0, "reward": 6.75, "reward_std": 2.1794495582580566, "rewards/reward_combined/mean": 6.75, "rewards/reward_combined/std": 2.1794495582580566, "step": 2509 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 36.25, "completions/clipped_ratio": 0.0, "completions/max_length": 40.0, "completions/max_terminated_length": 40.0, "completions/mean_length": 36.25, "completions/mean_terminated_length": 36.25, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 46.48148148148148, "frac_reward_zero_std": 0.0, "grad_norm": 2.8908913135528564, "kl": 0.026527891866862774, "learning_rate": 4.161666666666667e-06, "loss": 0.0073, "num_tokens": 759371.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 2510 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.75, "completions/clipped_ratio": 0.0, "completions/max_length": 28.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 23.75, "completions/mean_terminated_length": 23.75, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 46.5, "frac_reward_zero_std": 1.0, "grad_norm": 0.07712092995643616, "kl": 0.02340590674430132, "learning_rate": 4.161111111111111e-06, "loss": 0.0012, "num_tokens": 759714.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 2511 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.5, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 9.5, "completions/mean_terminated_length": 9.5, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 46.51851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.14373567700386047, "kl": 0.014688474126160145, "learning_rate": 4.160555555555556e-06, "loss": 0.0007, "num_tokens": 759972.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2512 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 46.53703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.02743067778646946, "kl": 0.008696028962731361, "learning_rate": 4.16e-06, "loss": 0.0004, "num_tokens": 760284.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 2513 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.0, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 20.0, "completions/mean_terminated_length": 20.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 46.55555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.11043833196163177, "kl": 0.04111274518072605, "learning_rate": 4.159444444444445e-06, "loss": 0.002, "num_tokens": 760584.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 2514 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 46.574074074074076, "frac_reward_zero_std": 1.0, "grad_norm": 0.015548611991107464, "kl": 0.0006753703055437654, "learning_rate": 4.158888888888889e-06, "loss": 0.0, "num_tokens": 760800.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 2515 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.0, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 24.0, "completions/mean_terminated_length": 24.0, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 46.592592592592595, "frac_reward_zero_std": 1.0, "grad_norm": 0.002863161964341998, "kl": 0.09034491330385208, "learning_rate": 4.158333333333334e-06, "loss": 0.0045, "num_tokens": 761164.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 2516 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.0, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 20.0, "completions/mean_terminated_length": 20.0, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 46.611111111111114, "frac_reward_zero_std": 1.0, "grad_norm": 0.09452883899211884, "kl": 0.017891965806484222, "learning_rate": 4.157777777777778e-06, "loss": 0.0009, "num_tokens": 761496.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2517 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.25, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.25, "completions/mean_terminated_length": 13.25, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 46.629629629629626, "frac_reward_zero_std": 1.0, "grad_norm": 0.05122678726911545, "kl": 0.004946009663399309, "learning_rate": 4.157222222222223e-06, "loss": 0.0002, "num_tokens": 761773.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2518 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.25, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 14.25, "completions/mean_terminated_length": 14.25, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 46.648148148148145, "frac_reward_zero_std": 1.0, "grad_norm": 0.21077175438404083, "kl": 0.05352201871573925, "learning_rate": 4.156666666666667e-06, "loss": 0.0028, "num_tokens": 762062.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2519 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.5, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 18.5, "completions/mean_terminated_length": 18.5, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 46.666666666666664, "frac_reward_zero_std": 0.0, "grad_norm": 3.947786808013916, "kl": 0.18862552009522915, "learning_rate": 4.156111111111111e-06, "loss": 0.0148, "num_tokens": 762356.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 2520 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 5.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 5.0, "completions/mean_terminated_length": 5.0, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 46.68518518518518, "frac_reward_zero_std": 1.0, "grad_norm": 0.00391028868034482, "kl": 0.000248700387601275, "learning_rate": 4.155555555555556e-06, "loss": 0.0, "num_tokens": 762576.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 2521 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 46.7037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.02432064712047577, "kl": 0.002543898852309212, "learning_rate": 4.155e-06, "loss": 0.0001, "num_tokens": 762848.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2522 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 12.5, "completions/mean_terminated_length": 12.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 46.72222222222222, "frac_reward_zero_std": 1.0, "grad_norm": 0.015011154115200043, "kl": 0.0033867476740852, "learning_rate": 4.154444444444445e-06, "loss": 0.0002, "num_tokens": 763130.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2523 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 46.74074074074074, "frac_reward_zero_std": 0.0, "grad_norm": 1.6772359609603882, "kl": 0.03905806224793196, "learning_rate": 4.153888888888889e-06, "loss": 0.0681, "num_tokens": 763416.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 2524 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.0, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 19.0, "completions/mean_terminated_length": 19.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 46.75925925925926, "frac_reward_zero_std": 0.0, "grad_norm": 2.3634426593780518, "kl": 0.017609330301638693, "learning_rate": 4.153333333333334e-06, "loss": 0.0008, "num_tokens": 763732.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 2525 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 34.5, "completions/clipped_ratio": 0.0, "completions/max_length": 65.0, "completions/max_terminated_length": 65.0, "completions/mean_length": 34.5, "completions/mean_terminated_length": 34.5, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 46.77777777777778, "frac_reward_zero_std": 1.0, "grad_norm": 0.3271539807319641, "kl": 0.05129472818225622, "learning_rate": 4.152777777777778e-06, "loss": 0.0034, "num_tokens": 764114.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2526 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 46.7962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.015080343931913376, "kl": 0.03341222554445267, "learning_rate": 4.152222222222223e-06, "loss": 0.0017, "num_tokens": 764330.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 2527 }, { "clip_ratio/high_max": 0.012820512987673283, "clip_ratio/high_mean": 0.012820512987673283, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.012820512987673283, "completion_length": 24.75, "completions/clipped_ratio": 0.0, "completions/max_length": 43.0, "completions/max_terminated_length": 43.0, "completions/mean_length": 24.75, "completions/mean_terminated_length": 24.75, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 46.81481481481482, "frac_reward_zero_std": 0.0, "grad_norm": 4.681430339813232, "kl": 0.06288899295032024, "learning_rate": 4.151666666666667e-06, "loss": -0.2106, "num_tokens": 764649.0, "reward": 7.0, "reward_std": 1.0, "rewards/reward_combined/mean": 7.0, "rewards/reward_combined/std": 1.0, "step": 2528 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 46.833333333333336, "frac_reward_zero_std": 1.0, "grad_norm": 0.07615989446640015, "kl": 0.004460433206986636, "learning_rate": 4.151111111111111e-06, "loss": 0.0002, "num_tokens": 764905.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 2529 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 46.851851851851855, "frac_reward_zero_std": 1.0, "grad_norm": 0.03281604126095772, "kl": 0.0018713687313720584, "learning_rate": 4.150555555555556e-06, "loss": 0.0001, "num_tokens": 765229.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2530 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 46.870370370370374, "frac_reward_zero_std": 1.0, "grad_norm": 0.02492535673081875, "kl": 0.01021040603518486, "learning_rate": 4.15e-06, "loss": 0.0005, "num_tokens": 765501.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2531 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.75, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.75, "completions/mean_terminated_length": 14.75, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 46.888888888888886, "frac_reward_zero_std": 1.0, "grad_norm": 0.008151696994900703, "kl": 0.016040201298892498, "learning_rate": 4.1494444444444444e-06, "loss": 0.0008, "num_tokens": 765792.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2532 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 46.907407407407405, "frac_reward_zero_std": 1.0, "grad_norm": 0.049023520201444626, "kl": 0.008819380775094032, "learning_rate": 4.148888888888889e-06, "loss": 0.0004, "num_tokens": 766092.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2533 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 47.5, "completions/clipped_ratio": 0.0, "completions/max_length": 53.0, "completions/max_terminated_length": 53.0, "completions/mean_length": 47.5, "completions/mean_terminated_length": 47.5, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 46.925925925925924, "frac_reward_zero_std": 0.0, "grad_norm": 0.8918478488922119, "kl": 0.08285304717719555, "learning_rate": 4.148333333333334e-06, "loss": 0.0039, "num_tokens": 766562.0, "reward": 2.799999952316284, "reward_std": 0.4000000059604645, "rewards/reward_combined/mean": 2.799999952316284, "rewards/reward_combined/std": 0.4000000059604645, "step": 2534 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 46.94444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.4037191867828369, "kl": 0.04973078891634941, "learning_rate": 4.147777777777778e-06, "loss": 0.0025, "num_tokens": 766822.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 2535 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 5.75, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 5.75, "completions/mean_terminated_length": 5.75, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 46.96296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.2507380545139313, "kl": 0.03325359337031841, "learning_rate": 4.147222222222223e-06, "loss": 0.0018, "num_tokens": 767041.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 2536 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.007575757801532745, "clip_ratio/low_min": 0.007575757801532745, "clip_ratio/region_mean": 0.007575757801532745, "completion_length": 32.5, "completions/clipped_ratio": 0.0, "completions/max_length": 47.0, "completions/max_terminated_length": 47.0, "completions/mean_length": 32.5, "completions/mean_terminated_length": 32.5, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 46.98148148148148, "frac_reward_zero_std": 0.0, "grad_norm": 4.982953071594238, "kl": 0.10858285427093506, "learning_rate": 4.146666666666667e-06, "loss": 0.2169, "num_tokens": 767407.0, "reward": 2.375, "reward_std": 2.25, "rewards/reward_combined/mean": 2.375, "rewards/reward_combined/std": 2.25, "step": 2537 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.5, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.5, "completions/mean_terminated_length": 6.5, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 47.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.021798834204673767, "kl": 0.000773283711168915, "learning_rate": 4.146111111111111e-06, "loss": 0.0, "num_tokens": 767641.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 2538 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.0, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 24.0, "completions/mean_terminated_length": 24.0, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 47.01851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.002497282810509205, "kl": 0.09049433469772339, "learning_rate": 4.145555555555556e-06, "loss": 0.0045, "num_tokens": 768005.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 2539 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.75, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.75, "completions/mean_terminated_length": 14.75, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 47.03703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.09578297287225723, "kl": 0.01174119208008051, "learning_rate": 4.145e-06, "loss": 0.0006, "num_tokens": 768276.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2540 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 47.05555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.026228073984384537, "kl": 0.032947856932878494, "learning_rate": 4.1444444444444445e-06, "loss": 0.0016, "num_tokens": 768560.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2541 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 5.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 5.0, "completions/mean_terminated_length": 5.0, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 47.074074074074076, "frac_reward_zero_std": 1.0, "grad_norm": 0.0037878972943872213, "kl": 0.0001930117650772445, "learning_rate": 4.143888888888889e-06, "loss": 0.0, "num_tokens": 768780.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 2542 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 47.092592592592595, "frac_reward_zero_std": 0.0, "grad_norm": 12.032971382141113, "kl": 0.04332250356674194, "learning_rate": 4.143333333333334e-06, "loss": 0.0038, "num_tokens": 768988.0, "reward": 2.75, "reward_std": 1.5, "rewards/reward_combined/mean": 2.75, "rewards/reward_combined/std": 1.5, "step": 2543 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.5, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 47.111111111111114, "frac_reward_zero_std": 1.0, "grad_norm": 0.030061615630984306, "kl": 0.002511048223823309, "learning_rate": 4.142777777777778e-06, "loss": 0.0001, "num_tokens": 769310.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2544 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 47.129629629629626, "frac_reward_zero_std": 1.0, "grad_norm": 0.010965422727167606, "kl": 0.0009803101420402527, "learning_rate": 4.142222222222223e-06, "loss": 0.0, "num_tokens": 769582.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2545 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 47.148148148148145, "frac_reward_zero_std": 1.0, "grad_norm": 0.015240363776683807, "kl": 0.033604949712753296, "learning_rate": 4.141666666666667e-06, "loss": 0.0017, "num_tokens": 769798.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 2546 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.0, "completions/clipped_ratio": 0.0, "completions/max_length": 28.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 24.0, "completions/mean_terminated_length": 24.0, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 47.166666666666664, "frac_reward_zero_std": 0.0, "grad_norm": 5.361410140991211, "kl": 0.08719121851027012, "learning_rate": 4.1411111111111115e-06, "loss": 0.0437, "num_tokens": 770122.0, "reward": 5.75, "reward_std": 4.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 4.5, "step": 2547 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 18.0, "completions/mean_terminated_length": 18.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 47.18518518518518, "frac_reward_zero_std": 1.0, "grad_norm": 0.00910953152924776, "kl": 0.001000861288048327, "learning_rate": 4.140555555555556e-06, "loss": 0.0001, "num_tokens": 770434.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2548 }, { "clip_ratio/high_max": 0.010869565419852734, "clip_ratio/high_mean": 0.010869565419852734, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010869565419852734, "completion_length": 21.5, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 21.5, "completions/mean_terminated_length": 21.5, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 47.2037037037037, "frac_reward_zero_std": 0.0, "grad_norm": 3.0401272773742676, "kl": 0.080459825694561, "learning_rate": 4.14e-06, "loss": -0.0217, "num_tokens": 770748.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 2549 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 47.22222222222222, "frac_reward_zero_std": 0.0, "grad_norm": 5.556026458740234, "kl": 0.02907082624733448, "learning_rate": 4.1394444444444445e-06, "loss": 0.2333, "num_tokens": 771006.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 2550 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.75, "completions/clipped_ratio": 0.0, "completions/max_length": 30.0, "completions/max_terminated_length": 30.0, "completions/mean_length": 21.75, "completions/mean_terminated_length": 21.75, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 47.24074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.06816200911998749, "kl": 0.041684962809085846, "learning_rate": 4.138888888888889e-06, "loss": 0.0022, "num_tokens": 771313.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 2551 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 30.25, "completions/clipped_ratio": 0.0, "completions/max_length": 50.0, "completions/max_terminated_length": 50.0, "completions/mean_length": 30.25, "completions/mean_terminated_length": 30.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 47.25925925925926, "frac_reward_zero_std": 0.0, "grad_norm": 3.878349781036377, "kl": 0.03047649748623371, "learning_rate": 4.138333333333333e-06, "loss": 0.1598, "num_tokens": 771654.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 2552 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 77.25, "completions/clipped_ratio": 0.25, "completions/max_length": 256.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 77.25, "completions/mean_terminated_length": 17.666667938232422, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 47.27777777777778, "frac_reward_zero_std": 0.0, "grad_norm": 2.2170310020446777, "kl": 0.13115517422556877, "learning_rate": 4.1377777777777784e-06, "loss": 0.4414, "num_tokens": 772203.0, "reward": 5.375, "reward_std": 5.25, "rewards/reward_combined/mean": 5.375, "rewards/reward_combined/std": 5.25, "step": 2553 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 26.75, "completions/clipped_ratio": 0.0, "completions/max_length": 43.0, "completions/max_terminated_length": 43.0, "completions/mean_length": 26.75, "completions/mean_terminated_length": 26.75, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 47.2962962962963, "frac_reward_zero_std": 0.0, "grad_norm": 4.764766693115234, "kl": 0.0558136273175478, "learning_rate": 4.137222222222223e-06, "loss": 0.1956, "num_tokens": 772562.0, "reward": 6.5, "reward_std": 2.0, "rewards/reward_combined/mean": 6.5, "rewards/reward_combined/std": 2.0, "step": 2554 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 47.31481481481482, "frac_reward_zero_std": 1.0, "grad_norm": 0.00017762051720637828, "kl": 3.2514333724975586e-05, "learning_rate": 4.136666666666667e-06, "loss": 0.0, "num_tokens": 772782.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 2555 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 28.5, "completions/clipped_ratio": 0.0, "completions/max_length": 33.0, "completions/max_terminated_length": 33.0, "completions/mean_length": 28.5, "completions/mean_terminated_length": 28.5, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 47.333333333333336, "frac_reward_zero_std": 0.0, "grad_norm": 2.195812463760376, "kl": 0.12940306216478348, "learning_rate": 4.1361111111111115e-06, "loss": 0.0688, "num_tokens": 773124.0, "reward": 7.625, "reward_std": 0.25, "rewards/reward_combined/mean": 7.625, "rewards/reward_combined/std": 0.25, "step": 2556 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.5, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 22.5, "completions/mean_terminated_length": 22.5, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 47.351851851851855, "frac_reward_zero_std": 1.0, "grad_norm": 0.04035919904708862, "kl": 0.007642156444489956, "learning_rate": 4.135555555555556e-06, "loss": 0.0004, "num_tokens": 773458.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2557 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 47.370370370370374, "frac_reward_zero_std": 1.0, "grad_norm": 0.03858639672398567, "kl": 0.0004228651523590088, "learning_rate": 4.135e-06, "loss": 0.0, "num_tokens": 773670.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 2558 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.0, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 20.0, "completions/mean_terminated_length": 20.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 47.388888888888886, "frac_reward_zero_std": 0.0, "grad_norm": 3.7550480365753174, "kl": 0.20898732542991638, "learning_rate": 4.1344444444444446e-06, "loss": 0.039, "num_tokens": 773970.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 2559 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 47.407407407407405, "frac_reward_zero_std": 1.0, "grad_norm": 0.10941924154758453, "kl": 0.021177157759666443, "learning_rate": 4.133888888888889e-06, "loss": 0.0011, "num_tokens": 774248.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2560 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 47.425925925925924, "frac_reward_zero_std": 1.0, "grad_norm": 0.004860364831984043, "kl": 0.007467274554073811, "learning_rate": 4.133333333333333e-06, "loss": 0.0004, "num_tokens": 774560.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 2561 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.5, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 19.5, "completions/mean_terminated_length": 19.5, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 47.44444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.14929969608783722, "kl": 0.049865636974573135, "learning_rate": 4.1327777777777785e-06, "loss": 0.0024, "num_tokens": 774886.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2562 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.25, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 17.25, "completions/mean_terminated_length": 17.25, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 47.46296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.31867316365242004, "kl": 0.23191051930189133, "learning_rate": 4.132222222222223e-06, "loss": 0.0117, "num_tokens": 775195.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2563 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.25, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 15.25, "completions/mean_terminated_length": 15.25, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 47.48148148148148, "frac_reward_zero_std": 0.0, "grad_norm": 2.9620678424835205, "kl": 0.014633152750320733, "learning_rate": 4.131666666666667e-06, "loss": 0.0775, "num_tokens": 775480.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 2564 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.75, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.75, "completions/mean_terminated_length": 13.75, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 47.5, "frac_reward_zero_std": 1.0, "grad_norm": 0.139554962515831, "kl": 0.08788768202066422, "learning_rate": 4.1311111111111116e-06, "loss": 0.0044, "num_tokens": 775751.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2565 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 47.5, "completions/clipped_ratio": 0.0, "completions/max_length": 53.0, "completions/max_terminated_length": 53.0, "completions/mean_length": 47.5, "completions/mean_terminated_length": 47.5, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 47.51851851851852, "frac_reward_zero_std": 0.0, "grad_norm": 1.214697003364563, "kl": 0.21911336481571198, "learning_rate": 4.130555555555556e-06, "loss": -0.1194, "num_tokens": 776221.0, "reward": 2.375, "reward_std": 1.25, "rewards/reward_combined/mean": 2.375, "rewards/reward_combined/std": 1.25, "step": 2566 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.25, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 21.25, "completions/mean_terminated_length": 21.25, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 47.53703703703704, "frac_reward_zero_std": 0.0, "grad_norm": 2.4700534343719482, "kl": 0.2093476578593254, "learning_rate": 4.13e-06, "loss": 0.0768, "num_tokens": 776558.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 2567 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.5, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 8.5, "completions/mean_terminated_length": 8.5, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 47.55555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.07860653847455978, "kl": 0.008845673524774611, "learning_rate": 4.129444444444445e-06, "loss": 0.0003, "num_tokens": 776812.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 2568 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 10.5, "completions/mean_terminated_length": 10.5, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 47.574074074074076, "frac_reward_zero_std": 0.0, "grad_norm": 8.586929321289062, "kl": 0.18961253110319376, "learning_rate": 4.12888888888889e-06, "loss": -0.0573, "num_tokens": 777082.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 2569 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 47.592592592592595, "frac_reward_zero_std": 1.0, "grad_norm": 0.007951161824166775, "kl": 0.0013378560543060303, "learning_rate": 4.128333333333333e-06, "loss": 0.0001, "num_tokens": 777294.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 2570 }, { "clip_ratio/high_max": 0.008620689623057842, "clip_ratio/high_mean": 0.008620689623057842, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.008620689623057842, "completion_length": 29.5, "completions/clipped_ratio": 0.0, "completions/max_length": 32.0, "completions/max_terminated_length": 32.0, "completions/mean_length": 29.5, "completions/mean_terminated_length": 29.5, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 47.611111111111114, "frac_reward_zero_std": 0.0, "grad_norm": 2.7530088424682617, "kl": 0.08889210596680641, "learning_rate": 4.1277777777777785e-06, "loss": -0.0275, "num_tokens": 777636.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 2571 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 47.629629629629626, "frac_reward_zero_std": 1.0, "grad_norm": 0.0028744323644787073, "kl": 0.018117262050509453, "learning_rate": 4.127222222222222e-06, "loss": 0.0009, "num_tokens": 777896.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2572 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 31.75, "completions/clipped_ratio": 0.0, "completions/max_length": 38.0, "completions/max_terminated_length": 38.0, "completions/mean_length": 31.75, "completions/mean_terminated_length": 31.75, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 47.648148148148145, "frac_reward_zero_std": 0.0, "grad_norm": 2.027186870574951, "kl": 0.13052519038319588, "learning_rate": 4.126666666666667e-06, "loss": 0.0446, "num_tokens": 778259.0, "reward": 2.75, "reward_std": 1.1902379989624023, "rewards/reward_combined/mean": 2.75, "rewards/reward_combined/std": 1.190238118171692, "step": 2573 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 47.666666666666664, "frac_reward_zero_std": 1.0, "grad_norm": 0.10640931874513626, "kl": 0.021208051592111588, "learning_rate": 4.126111111111112e-06, "loss": 0.0011, "num_tokens": 778551.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2574 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 47.68518518518518, "frac_reward_zero_std": 1.0, "grad_norm": 0.06598841398954391, "kl": 0.005850411951541901, "learning_rate": 4.125555555555556e-06, "loss": 0.0003, "num_tokens": 778811.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 2575 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 47.7037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.006608160212635994, "kl": 0.02642022166401148, "learning_rate": 4.125e-06, "loss": 0.0013, "num_tokens": 779079.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2576 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.5, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 9.5, "completions/mean_terminated_length": 9.5, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 47.72222222222222, "frac_reward_zero_std": 1.0, "grad_norm": 0.07564849406480789, "kl": 0.007870995439589024, "learning_rate": 4.124444444444445e-06, "loss": 0.0004, "num_tokens": 779337.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2577 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.25, "completions/clipped_ratio": 0.0, "completions/max_length": 29.0, "completions/max_terminated_length": 29.0, "completions/mean_length": 24.25, "completions/mean_terminated_length": 24.25, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 47.74074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.11896169185638428, "kl": 0.028101020492613316, "learning_rate": 4.12388888888889e-06, "loss": 0.0015, "num_tokens": 779682.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 2578 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 47.75925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.08108922094106674, "kl": 0.013365744147449732, "learning_rate": 4.123333333333333e-06, "loss": 0.0006, "num_tokens": 779978.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2579 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.25, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 16.25, "completions/mean_terminated_length": 16.25, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 47.77777777777778, "frac_reward_zero_std": 0.0, "grad_norm": 4.550353527069092, "kl": 0.11149526806548238, "learning_rate": 4.122777777777779e-06, "loss": 0.1749, "num_tokens": 780263.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 2580 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 47.7962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.027070315554738045, "kl": 0.0005772918375441805, "learning_rate": 4.122222222222222e-06, "loss": 0.0, "num_tokens": 780519.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 2581 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 47.81481481481482, "frac_reward_zero_std": 1.0, "grad_norm": 0.012564125470817089, "kl": 0.0006582994828931987, "learning_rate": 4.121666666666667e-06, "loss": 0.0, "num_tokens": 780811.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2582 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 12.5, "completions/mean_terminated_length": 12.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 47.833333333333336, "frac_reward_zero_std": 1.0, "grad_norm": 0.018763933330774307, "kl": 0.0038910636212676764, "learning_rate": 4.121111111111112e-06, "loss": 0.0002, "num_tokens": 781093.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2583 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.75, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.75, "completions/mean_terminated_length": 6.75, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 47.851851851851855, "frac_reward_zero_std": 1.0, "grad_norm": 0.01385953277349472, "kl": 0.0007036711613181978, "learning_rate": 4.120555555555556e-06, "loss": 0.0, "num_tokens": 781328.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 2584 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 47.870370370370374, "frac_reward_zero_std": 1.0, "grad_norm": 0.00430035125464201, "kl": 0.010565198957920074, "learning_rate": 4.12e-06, "loss": 0.0005, "num_tokens": 781564.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 2585 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 29.25, "completions/clipped_ratio": 0.0, "completions/max_length": 31.0, "completions/max_terminated_length": 31.0, "completions/mean_length": 29.25, "completions/mean_terminated_length": 29.25, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 47.888888888888886, "frac_reward_zero_std": 0.0, "grad_norm": 1.8104617595672607, "kl": 0.09960044175386429, "learning_rate": 4.119444444444445e-06, "loss": 0.0199, "num_tokens": 781897.0, "reward": 4.625, "reward_std": 2.25, "rewards/reward_combined/mean": 4.625, "rewards/reward_combined/std": 2.25, "step": 2586 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.25, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 18.25, "completions/mean_terminated_length": 18.25, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 47.907407407407405, "frac_reward_zero_std": 1.0, "grad_norm": 0.0331806056201458, "kl": 0.015274240635335445, "learning_rate": 4.118888888888889e-06, "loss": 0.0008, "num_tokens": 782198.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2587 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 47.925925925925924, "frac_reward_zero_std": 1.0, "grad_norm": 0.02927803434431553, "kl": 0.014879277994623408, "learning_rate": 4.1183333333333334e-06, "loss": 0.0007, "num_tokens": 782494.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2588 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.25, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 14.25, "completions/mean_terminated_length": 14.25, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 47.94444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.18499934673309326, "kl": 0.035588097758591175, "learning_rate": 4.117777777777779e-06, "loss": 0.0019, "num_tokens": 782771.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2589 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.75, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 13.75, "completions/mean_terminated_length": 13.75, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 47.96296296296296, "frac_reward_zero_std": 0.0, "grad_norm": 5.733246803283691, "kl": 0.10767726693302393, "learning_rate": 4.117222222222222e-06, "loss": 0.1743, "num_tokens": 783050.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 2590 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.0, "completions/clipped_ratio": 0.0, "completions/max_length": 28.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 21.0, "completions/mean_terminated_length": 21.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 47.98148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.9223594665527344, "kl": 0.2577384486794472, "learning_rate": 4.116666666666667e-06, "loss": 0.0134, "num_tokens": 783358.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2591 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.5, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 12.5, "completions/mean_terminated_length": 12.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 48.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.04086935520172119, "kl": 0.004472052678465843, "learning_rate": 4.116111111111112e-06, "loss": 0.0002, "num_tokens": 783632.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2592 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 48.01851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.01096338964998722, "kl": 0.00023325731308432296, "learning_rate": 4.115555555555556e-06, "loss": 0.0, "num_tokens": 783902.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2593 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 48.03703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.005125122610479593, "kl": 0.0009934842237271369, "learning_rate": 4.115e-06, "loss": 0.0, "num_tokens": 784162.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 2594 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 48.05555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.004230940248817205, "kl": 0.010583482682704926, "learning_rate": 4.114444444444445e-06, "loss": 0.0005, "num_tokens": 784398.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 2595 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 48.074074074074076, "frac_reward_zero_std": 1.0, "grad_norm": 0.06431061029434204, "kl": 0.019002332352101803, "learning_rate": 4.113888888888889e-06, "loss": 0.001, "num_tokens": 784698.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2596 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 33.0, "completions/max_terminated_length": 33.0, "completions/mean_length": 18.0, "completions/mean_terminated_length": 18.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 48.092592592592595, "frac_reward_zero_std": 0.0, "grad_norm": 4.831554412841797, "kl": 0.019375002942979336, "learning_rate": 4.1133333333333335e-06, "loss": 0.2235, "num_tokens": 784990.0, "reward": 5.875, "reward_std": 3.25, "rewards/reward_combined/mean": 5.875, "rewards/reward_combined/std": 3.25, "step": 2597 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 48.111111111111114, "frac_reward_zero_std": 1.0, "grad_norm": 1.283219814300537, "kl": 0.18471099808812141, "learning_rate": 4.112777777777779e-06, "loss": 0.0098, "num_tokens": 785264.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2598 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 48.129629629629626, "frac_reward_zero_std": 0.0, "grad_norm": 8.344274520874023, "kl": 0.03610258502885699, "learning_rate": 4.112222222222222e-06, "loss": 0.0543, "num_tokens": 785538.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 2599 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.75, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.75, "completions/mean_terminated_length": 14.75, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 48.148148148148145, "frac_reward_zero_std": 1.0, "grad_norm": 0.014908158220350742, "kl": 0.024000167846679688, "learning_rate": 4.111666666666667e-06, "loss": 0.0012, "num_tokens": 785829.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2600 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.5, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 19.5, "completions/mean_terminated_length": 19.5, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 48.166666666666664, "frac_reward_zero_std": 1.0, "grad_norm": 0.19674623012542725, "kl": 0.04069496504962444, "learning_rate": 4.111111111111111e-06, "loss": 0.002, "num_tokens": 786127.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2601 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.75, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 21.75, "completions/mean_terminated_length": 21.75, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 48.18518518518518, "frac_reward_zero_std": 0.0, "grad_norm": 3.4919071197509766, "kl": 0.10710879787802696, "learning_rate": 4.110555555555556e-06, "loss": -0.0021, "num_tokens": 786442.0, "reward": 5.75, "reward_std": 2.598076105117798, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 2.598076105117798, "step": 2602 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 48.2037037037037, "frac_reward_zero_std": 0.0, "grad_norm": 4.046241760253906, "kl": 0.23682906664907932, "learning_rate": 4.1100000000000005e-06, "loss": 0.0082, "num_tokens": 786726.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 2603 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 48.22222222222222, "frac_reward_zero_std": 0.0, "grad_norm": 3.4909398555755615, "kl": 0.10223204270005226, "learning_rate": 4.109444444444445e-06, "loss": 0.0049, "num_tokens": 787010.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 2604 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.0, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 20.0, "completions/mean_terminated_length": 20.0, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 48.24074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.14653001725673676, "kl": 0.03150942921638489, "learning_rate": 4.108888888888889e-06, "loss": 0.0016, "num_tokens": 787342.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2605 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 48.25925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.053502582013607025, "kl": 0.004015836864709854, "learning_rate": 4.1083333333333335e-06, "loss": 0.0002, "num_tokens": 787586.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 2606 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 48.27777777777778, "frac_reward_zero_std": 1.0, "grad_norm": 0.00301264226436615, "kl": 0.01807650923728943, "learning_rate": 4.107777777777779e-06, "loss": 0.0009, "num_tokens": 787846.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2607 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 48.2962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.047825973480939865, "kl": 0.0014906749711371958, "learning_rate": 4.107222222222222e-06, "loss": 0.0001, "num_tokens": 788124.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2608 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 48.31481481481482, "frac_reward_zero_std": 1.0, "grad_norm": 0.03782776743173599, "kl": 0.0044165924191474915, "learning_rate": 4.1066666666666674e-06, "loss": 0.0002, "num_tokens": 788384.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 2609 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 48.333333333333336, "frac_reward_zero_std": 1.0, "grad_norm": 0.21308481693267822, "kl": 0.022120334208011627, "learning_rate": 4.106111111111111e-06, "loss": 0.0011, "num_tokens": 788644.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2610 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.5, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 19.5, "completions/mean_terminated_length": 19.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 48.351851851851855, "frac_reward_zero_std": 0.0, "grad_norm": 3.2282238006591797, "kl": 0.005851496593095362, "learning_rate": 4.105555555555556e-06, "loss": 0.1078, "num_tokens": 788966.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 2611 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 48.370370370370374, "frac_reward_zero_std": 0.0, "grad_norm": 0.7063078284263611, "kl": 0.3331001741462387, "learning_rate": 4.1050000000000005e-06, "loss": 0.0161, "num_tokens": 789246.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 2612 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.0, "completions/max_length": 33.0, "completions/max_terminated_length": 33.0, "completions/mean_length": 32.0, "completions/mean_terminated_length": 32.0, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 48.388888888888886, "frac_reward_zero_std": 1.0, "grad_norm": 0.02446356602013111, "kl": 0.01938885822892189, "learning_rate": 4.104444444444445e-06, "loss": 0.001, "num_tokens": 789598.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2613 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 48.407407407407405, "frac_reward_zero_std": 1.0, "grad_norm": 0.009490130469202995, "kl": 0.02635563351213932, "learning_rate": 4.103888888888889e-06, "loss": 0.0013, "num_tokens": 789866.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2614 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 48.425925925925924, "frac_reward_zero_std": 1.0, "grad_norm": 0.0549541637301445, "kl": 0.011215811595320702, "learning_rate": 4.1033333333333336e-06, "loss": 0.0006, "num_tokens": 790144.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2615 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.5, "completions/clipped_ratio": 0.0, "completions/max_length": 31.0, "completions/max_terminated_length": 31.0, "completions/mean_length": 23.5, "completions/mean_terminated_length": 23.5, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 48.44444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.16016535460948944, "kl": 0.05393075942993164, "learning_rate": 4.102777777777778e-06, "loss": 0.0029, "num_tokens": 790486.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2616 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 48.46296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.014841003343462944, "kl": 0.034103237092494965, "learning_rate": 4.102222222222222e-06, "loss": 0.0017, "num_tokens": 790702.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 2617 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.0, "completions/clipped_ratio": 0.0, "completions/max_length": 28.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 21.0, "completions/mean_terminated_length": 21.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 48.48148148148148, "frac_reward_zero_std": 0.0, "grad_norm": 3.3060286045074463, "kl": 0.2107005938887596, "learning_rate": 4.1016666666666675e-06, "loss": -0.0699, "num_tokens": 791010.0, "reward": 7.625, "reward_std": 0.25, "rewards/reward_combined/mean": 7.625, "rewards/reward_combined/std": 0.25, "step": 2618 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 5.25, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 5.25, "completions/mean_terminated_length": 5.25, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 48.5, "frac_reward_zero_std": 1.0, "grad_norm": 0.07307449728250504, "kl": 0.00819119019433856, "learning_rate": 4.101111111111111e-06, "loss": 0.0004, "num_tokens": 791231.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 2619 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 48.51851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.00017734481662046164, "kl": 3.6947429180145264e-05, "learning_rate": 4.100555555555556e-06, "loss": 0.0, "num_tokens": 791451.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 2620 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 30.25, "completions/clipped_ratio": 0.0, "completions/max_length": 40.0, "completions/max_terminated_length": 40.0, "completions/mean_length": 30.25, "completions/mean_terminated_length": 30.25, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 48.53703703703704, "frac_reward_zero_std": 0.0, "grad_norm": 2.2035489082336426, "kl": 0.09711670130491257, "learning_rate": 4.1e-06, "loss": 0.0075, "num_tokens": 791808.0, "reward": 5.5, "reward_std": 2.309401035308838, "rewards/reward_combined/mean": 5.5, "rewards/reward_combined/std": 2.309401035308838, "step": 2621 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 48.55555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.10816437005996704, "kl": 0.0024197623133659363, "learning_rate": 4.099444444444445e-06, "loss": 0.0001, "num_tokens": 792020.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 2622 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 48.574074074074076, "frac_reward_zero_std": 1.0, "grad_norm": 0.0860278308391571, "kl": 0.023159844800829887, "learning_rate": 4.098888888888889e-06, "loss": 0.0012, "num_tokens": 792310.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2623 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.75, "completions/clipped_ratio": 0.0, "completions/max_length": 27.0, "completions/max_terminated_length": 27.0, "completions/mean_length": 20.75, "completions/mean_terminated_length": 20.75, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 48.592592592592595, "frac_reward_zero_std": 0.0, "grad_norm": 8.530370712280273, "kl": 1.9358704686164856, "learning_rate": 4.098333333333334e-06, "loss": -0.057, "num_tokens": 792641.0, "reward": 1.75, "reward_std": 2.872281312942505, "rewards/reward_combined/mean": 1.75, "rewards/reward_combined/std": 2.872281312942505, "step": 2624 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 48.611111111111114, "frac_reward_zero_std": 1.0, "grad_norm": 0.043662961572408676, "kl": 0.001445272529963404, "learning_rate": 4.097777777777778e-06, "loss": 0.0001, "num_tokens": 792897.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 2625 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 18.0, "completions/mean_terminated_length": 18.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 48.629629629629626, "frac_reward_zero_std": 1.0, "grad_norm": 0.03866761922836304, "kl": 0.007387477904558182, "learning_rate": 4.097222222222222e-06, "loss": 0.0004, "num_tokens": 793217.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 2626 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 48.648148148148145, "frac_reward_zero_std": 1.0, "grad_norm": 0.1599350869655609, "kl": 0.024433777667582035, "learning_rate": 4.0966666666666675e-06, "loss": 0.0012, "num_tokens": 793485.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2627 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.25, "completions/clipped_ratio": 0.0, "completions/max_length": 34.0, "completions/max_terminated_length": 34.0, "completions/mean_length": 24.25, "completions/mean_terminated_length": 24.25, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 48.666666666666664, "frac_reward_zero_std": 0.0, "grad_norm": 4.343060493469238, "kl": 0.1816269736737013, "learning_rate": 4.096111111111111e-06, "loss": -0.0264, "num_tokens": 793834.0, "reward": 5.0, "reward_std": 3.316624879837036, "rewards/reward_combined/mean": 5.0, "rewards/reward_combined/std": 3.316624879837036, "step": 2628 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 48.68518518518518, "frac_reward_zero_std": 1.0, "grad_norm": 0.3916577100753784, "kl": 0.04098367691040039, "learning_rate": 4.095555555555556e-06, "loss": 0.0025, "num_tokens": 794042.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 2629 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.75, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 15.75, "completions/mean_terminated_length": 15.75, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 48.7037037037037, "frac_reward_zero_std": 0.0, "grad_norm": 2.359607219696045, "kl": 0.24602558463811874, "learning_rate": 4.095e-06, "loss": -0.0067, "num_tokens": 794345.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 2630 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 48.72222222222222, "frac_reward_zero_std": 1.0, "grad_norm": 0.02802380546927452, "kl": 0.004992792120901868, "learning_rate": 4.094444444444445e-06, "loss": 0.0003, "num_tokens": 794643.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2631 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 48.74074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.007828316651284695, "kl": 0.0011133402585983276, "learning_rate": 4.093888888888889e-06, "loss": 0.0001, "num_tokens": 794855.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 2632 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.0, "completions/max_length": 35.0, "completions/max_terminated_length": 35.0, "completions/mean_length": 32.0, "completions/mean_terminated_length": 32.0, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 48.75925925925926, "frac_reward_zero_std": 0.0, "grad_norm": 3.7872986793518066, "kl": 0.11769361793994904, "learning_rate": 4.093333333333334e-06, "loss": -0.0524, "num_tokens": 795211.0, "reward": 6.75, "reward_std": 2.5, "rewards/reward_combined/mean": 6.75, "rewards/reward_combined/std": 2.5, "step": 2633 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.25, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 8.25, "completions/mean_terminated_length": 8.25, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 48.77777777777778, "frac_reward_zero_std": 1.0, "grad_norm": 0.15968212485313416, "kl": 0.012567598998430185, "learning_rate": 4.092777777777778e-06, "loss": 0.0008, "num_tokens": 795452.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 2634 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 37.25, "completions/clipped_ratio": 0.0, "completions/max_length": 50.0, "completions/max_terminated_length": 50.0, "completions/mean_length": 37.25, "completions/mean_terminated_length": 37.25, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 48.7962962962963, "frac_reward_zero_std": 0.0, "grad_norm": 2.5918221473693848, "kl": 0.10611317306756973, "learning_rate": 4.092222222222222e-06, "loss": 0.1598, "num_tokens": 795817.0, "reward": 5.75, "reward_std": 2.598076105117798, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 2.598076105117798, "step": 2635 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.25, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 18.25, "completions/mean_terminated_length": 18.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 48.81481481481482, "frac_reward_zero_std": 1.0, "grad_norm": 0.031579528003931046, "kl": 0.0046273956540971994, "learning_rate": 4.091666666666667e-06, "loss": 0.0002, "num_tokens": 796130.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2636 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 36.75, "completions/clipped_ratio": 0.0, "completions/max_length": 39.0, "completions/max_terminated_length": 39.0, "completions/mean_length": 36.75, "completions/mean_terminated_length": 36.75, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 48.833333333333336, "frac_reward_zero_std": 1.0, "grad_norm": 0.08614679425954819, "kl": 0.0696187112480402, "learning_rate": 4.091111111111111e-06, "loss": 0.0034, "num_tokens": 796497.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2637 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.0, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 24.0, "completions/mean_terminated_length": 24.0, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 48.851851851851855, "frac_reward_zero_std": 1.0, "grad_norm": 0.0021373326890170574, "kl": 0.09058188274502754, "learning_rate": 4.090555555555556e-06, "loss": 0.0045, "num_tokens": 796861.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 2638 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.25, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 17.25, "completions/mean_terminated_length": 17.25, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 48.870370370370374, "frac_reward_zero_std": 1.0, "grad_norm": 0.1202026978135109, "kl": 0.03565598372370005, "learning_rate": 4.09e-06, "loss": 0.0019, "num_tokens": 797150.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2639 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 48.25, "completions/clipped_ratio": 0.0, "completions/max_length": 53.0, "completions/max_terminated_length": 53.0, "completions/mean_length": 48.25, "completions/mean_terminated_length": 48.25, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 48.888888888888886, "frac_reward_zero_std": 1.0, "grad_norm": 2.8187458515167236, "kl": 0.34921011328697205, "learning_rate": 4.089444444444445e-06, "loss": 0.0163, "num_tokens": 797623.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 2640 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.0, "completions/clipped_ratio": 0.0, "completions/max_length": 25.0, "completions/max_terminated_length": 25.0, "completions/mean_length": 21.0, "completions/mean_terminated_length": 21.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 48.907407407407405, "frac_reward_zero_std": 1.0, "grad_norm": 0.16990986466407776, "kl": 0.06016245484352112, "learning_rate": 4.088888888888889e-06, "loss": 0.0032, "num_tokens": 797927.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 2641 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.75, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 19.75, "completions/mean_terminated_length": 19.75, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 48.925925925925924, "frac_reward_zero_std": 0.0, "grad_norm": 4.747857093811035, "kl": 0.20558230578899384, "learning_rate": 4.088333333333334e-06, "loss": 0.0675, "num_tokens": 798246.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 2642 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 48.94444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.07316245138645172, "kl": 0.00858387048356235, "learning_rate": 4.087777777777778e-06, "loss": 0.0004, "num_tokens": 798568.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2643 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.0, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 17.0, "completions/mean_terminated_length": 17.0, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 48.96296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.36428219079971313, "kl": 0.06037887465208769, "learning_rate": 4.0872222222222224e-06, "loss": 0.003, "num_tokens": 798864.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2644 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.0, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 22.0, "completions/mean_terminated_length": 22.0, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 48.98148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.2034560590982437, "kl": 0.04844702035188675, "learning_rate": 4.086666666666667e-06, "loss": 0.0024, "num_tokens": 799180.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2645 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.5, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 9.5, "completions/mean_terminated_length": 9.5, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 49.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.045356497168540955, "kl": 0.005775585072115064, "learning_rate": 4.086111111111111e-06, "loss": 0.0003, "num_tokens": 799438.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2646 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.0, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 19.0, "completions/mean_terminated_length": 19.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 49.01851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.021811578422784805, "kl": 0.0013654259964823723, "learning_rate": 4.085555555555556e-06, "loss": 0.0001, "num_tokens": 799754.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2647 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 47.75, "completions/clipped_ratio": 0.0, "completions/max_length": 53.0, "completions/max_terminated_length": 53.0, "completions/mean_length": 47.75, "completions/mean_terminated_length": 47.75, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 49.03703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 1.19867742061615, "kl": 0.17661643773317337, "learning_rate": 4.085e-06, "loss": 0.0093, "num_tokens": 800225.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 2648 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 12.5, "completions/mean_terminated_length": 12.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 49.05555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.1767161637544632, "kl": 0.027717129210941494, "learning_rate": 4.084444444444445e-06, "loss": 0.0014, "num_tokens": 800507.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2649 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.0, "completions/clipped_ratio": 0.0, "completions/max_length": 35.0, "completions/max_terminated_length": 35.0, "completions/mean_length": 21.0, "completions/mean_terminated_length": 21.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 49.074074074074076, "frac_reward_zero_std": 0.0, "grad_norm": 8.70537281036377, "kl": 0.054378073662519455, "learning_rate": 4.083888888888889e-06, "loss": 0.1455, "num_tokens": 800811.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 2650 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.25, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 15.25, "completions/mean_terminated_length": 15.25, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 49.092592592592595, "frac_reward_zero_std": 0.0, "grad_norm": 3.2035675048828125, "kl": 0.016266703139990568, "learning_rate": 4.083333333333334e-06, "loss": -0.0142, "num_tokens": 801084.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 2651 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 49.111111111111114, "frac_reward_zero_std": 1.0, "grad_norm": 0.0019767521880567074, "kl": 0.001812822069041431, "learning_rate": 4.082777777777778e-06, "loss": 0.0001, "num_tokens": 801364.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2652 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 26.25, "completions/clipped_ratio": 0.0, "completions/max_length": 29.0, "completions/max_terminated_length": 29.0, "completions/mean_length": 26.25, "completions/mean_terminated_length": 26.25, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 49.129629629629626, "frac_reward_zero_std": 0.0, "grad_norm": 3.4839532375335693, "kl": 0.11346525512635708, "learning_rate": 4.0822222222222225e-06, "loss": 0.0208, "num_tokens": 801717.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 2653 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.75, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 17.75, "completions/mean_terminated_length": 17.75, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 49.148148148148145, "frac_reward_zero_std": 1.0, "grad_norm": 0.1173497885465622, "kl": 0.02187745738774538, "learning_rate": 4.081666666666667e-06, "loss": 0.0011, "num_tokens": 802016.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2654 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 49.166666666666664, "frac_reward_zero_std": 1.0, "grad_norm": 0.00015371311747003347, "kl": 4.6603381633758545e-05, "learning_rate": 4.081111111111111e-06, "loss": 0.0, "num_tokens": 802236.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 2655 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.5, "completions/clipped_ratio": 0.0, "completions/max_length": 31.0, "completions/max_terminated_length": 31.0, "completions/mean_length": 23.5, "completions/mean_terminated_length": 23.5, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 49.18518518518518, "frac_reward_zero_std": 1.0, "grad_norm": 0.04202881082892418, "kl": 0.030999218113720417, "learning_rate": 4.0805555555555555e-06, "loss": 0.0015, "num_tokens": 802578.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2656 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.75, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.75, "completions/mean_terminated_length": 14.75, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 49.2037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.06434456259012222, "kl": 0.011959620285779238, "learning_rate": 4.08e-06, "loss": 0.0006, "num_tokens": 802869.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2657 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 31.75, "completions/clipped_ratio": 0.0, "completions/max_length": 42.0, "completions/max_terminated_length": 42.0, "completions/mean_length": 31.75, "completions/mean_terminated_length": 31.75, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 49.22222222222222, "frac_reward_zero_std": 1.0, "grad_norm": 0.08219822496175766, "kl": 0.05049626901745796, "learning_rate": 4.079444444444445e-06, "loss": 0.0024, "num_tokens": 803224.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2658 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 49.24074074074074, "frac_reward_zero_std": 0.0, "grad_norm": 7.9797043800354, "kl": 0.026751229190267622, "learning_rate": 4.0788888888888895e-06, "loss": 0.1939, "num_tokens": 803464.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 2659 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 49.25925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.019988538697361946, "kl": 0.03138643503189087, "learning_rate": 4.078333333333334e-06, "loss": 0.0016, "num_tokens": 803680.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 2660 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 49.27777777777778, "frac_reward_zero_std": 1.0, "grad_norm": 0.027068186551332474, "kl": 0.0005431353929452598, "learning_rate": 4.077777777777778e-06, "loss": 0.0, "num_tokens": 803936.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 2661 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.0, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 23.0, "completions/mean_terminated_length": 23.0, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 49.2962962962963, "frac_reward_zero_std": 0.0, "grad_norm": 3.445765495300293, "kl": 0.19393999874591827, "learning_rate": 4.0772222222222225e-06, "loss": 0.0507, "num_tokens": 804252.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 2662 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 49.31481481481482, "frac_reward_zero_std": 1.0, "grad_norm": 0.0067919655703008175, "kl": 0.0003070045495405793, "learning_rate": 4.076666666666667e-06, "loss": 0.0, "num_tokens": 804548.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2663 }, { "clip_ratio/high_max": 0.013513513840734959, "clip_ratio/high_mean": 0.013513513840734959, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.013513513840734959, "completion_length": 18.5, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 18.5, "completions/mean_terminated_length": 18.5, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 49.333333333333336, "frac_reward_zero_std": 0.0, "grad_norm": 2.626190662384033, "kl": 0.09030477795749903, "learning_rate": 4.076111111111111e-06, "loss": -0.0149, "num_tokens": 804842.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 2664 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.0, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 21.0, "completions/mean_terminated_length": 21.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 49.351851851851855, "frac_reward_zero_std": 0.0, "grad_norm": 4.8105998039245605, "kl": 0.07477825880050659, "learning_rate": 4.075555555555556e-06, "loss": 0.0456, "num_tokens": 805178.0, "reward": 6.125, "reward_std": 3.75, "rewards/reward_combined/mean": 6.125, "rewards/reward_combined/std": 3.75, "step": 2665 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 49.370370370370374, "frac_reward_zero_std": 1.0, "grad_norm": 0.011717362329363823, "kl": 0.0034538283944129944, "learning_rate": 4.075e-06, "loss": 0.0002, "num_tokens": 805438.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 2666 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 5.25, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 5.25, "completions/mean_terminated_length": 5.25, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 49.388888888888886, "frac_reward_zero_std": 1.0, "grad_norm": 0.6538939476013184, "kl": 0.07091031526215374, "learning_rate": 4.074444444444445e-06, "loss": 0.0041, "num_tokens": 805659.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 2667 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.0, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 21.0, "completions/mean_terminated_length": 21.0, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 49.407407407407405, "frac_reward_zero_std": 0.0, "grad_norm": 2.0964901447296143, "kl": 0.021440504118800163, "learning_rate": 4.0738888888888895e-06, "loss": -0.0, "num_tokens": 805987.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 2668 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 49.425925925925924, "frac_reward_zero_std": 1.0, "grad_norm": 0.0349719300866127, "kl": 0.0019005201756954193, "learning_rate": 4.073333333333334e-06, "loss": 0.0001, "num_tokens": 806231.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 2669 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.25, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.25, "completions/mean_terminated_length": 14.25, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 49.44444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.12053936719894409, "kl": 0.08749377727508545, "learning_rate": 4.072777777777778e-06, "loss": 0.0043, "num_tokens": 806504.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2670 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 49.46296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.005705568473786116, "kl": 0.010165013372898102, "learning_rate": 4.0722222222222226e-06, "loss": 0.0005, "num_tokens": 806740.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 2671 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 34.5, "completions/clipped_ratio": 0.0, "completions/max_length": 40.0, "completions/max_terminated_length": 40.0, "completions/mean_length": 34.5, "completions/mean_terminated_length": 34.5, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 49.48148148148148, "frac_reward_zero_std": 0.0, "grad_norm": 1.6134487390518188, "kl": 0.03810116648674011, "learning_rate": 4.071666666666667e-06, "loss": -0.0442, "num_tokens": 807102.0, "reward": 5.0, "reward_std": 5.0, "rewards/reward_combined/mean": 5.0, "rewards/reward_combined/std": 5.0, "step": 2672 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.5, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 2.5, "completions/mean_terminated_length": 2.5, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 49.5, "frac_reward_zero_std": 1.0, "grad_norm": 0.08847735077142715, "kl": 0.007870703935623169, "learning_rate": 4.071111111111111e-06, "loss": 0.0004, "num_tokens": 807308.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 2673 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 49.51851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.007781932596117258, "kl": 0.0013917088508605957, "learning_rate": 4.070555555555556e-06, "loss": 0.0001, "num_tokens": 807520.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 2674 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.0, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 22.0, "completions/mean_terminated_length": 22.0, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 49.53703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.06794355809688568, "kl": 0.02206044690683484, "learning_rate": 4.07e-06, "loss": 0.0011, "num_tokens": 807836.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2675 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.5, "completions/clipped_ratio": 0.0, "completions/max_length": 27.0, "completions/max_terminated_length": 27.0, "completions/mean_length": 16.5, "completions/mean_terminated_length": 16.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 49.55555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.49463018774986267, "kl": 0.06431871093809605, "learning_rate": 4.069444444444444e-06, "loss": 0.0041, "num_tokens": 808122.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2676 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 49.574074074074076, "frac_reward_zero_std": 1.0, "grad_norm": 0.0031227311119437218, "kl": 0.018009627237915993, "learning_rate": 4.0688888888888896e-06, "loss": 0.0009, "num_tokens": 808382.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2677 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 49.592592592592595, "frac_reward_zero_std": 1.0, "grad_norm": 0.03158348798751831, "kl": 0.03199963131919503, "learning_rate": 4.068333333333334e-06, "loss": 0.0016, "num_tokens": 808672.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2678 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 27.25, "completions/clipped_ratio": 0.0, "completions/max_length": 35.0, "completions/max_terminated_length": 35.0, "completions/mean_length": 27.25, "completions/mean_terminated_length": 27.25, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 49.611111111111114, "frac_reward_zero_std": 0.0, "grad_norm": 5.10353946685791, "kl": 0.12345730140805244, "learning_rate": 4.067777777777778e-06, "loss": -0.0119, "num_tokens": 808997.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 2679 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 36.0, "completions/clipped_ratio": 0.0, "completions/max_length": 46.0, "completions/max_terminated_length": 46.0, "completions/mean_length": 36.0, "completions/mean_terminated_length": 36.0, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 49.629629629629626, "frac_reward_zero_std": 0.0, "grad_norm": 2.1655938625335693, "kl": 0.05833258852362633, "learning_rate": 4.067222222222223e-06, "loss": 0.1086, "num_tokens": 809377.0, "reward": 3.5, "reward_std": 2.915475845336914, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 2.915475845336914, "step": 2680 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 49.648148148148145, "frac_reward_zero_std": 0.0, "grad_norm": 2.9801902770996094, "kl": 0.057168591767549515, "learning_rate": 4.066666666666667e-06, "loss": 0.0362, "num_tokens": 809657.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 2681 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.75, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 10.75, "completions/mean_terminated_length": 10.75, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 49.666666666666664, "frac_reward_zero_std": 1.0, "grad_norm": 0.27581995725631714, "kl": 0.08562816120684147, "learning_rate": 4.066111111111111e-06, "loss": 0.0044, "num_tokens": 809928.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2682 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 33.75, "completions/clipped_ratio": 0.0, "completions/max_length": 39.0, "completions/max_terminated_length": 39.0, "completions/mean_length": 33.75, "completions/mean_terminated_length": 33.75, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 49.68518518518518, "frac_reward_zero_std": 0.0, "grad_norm": 1.5477654933929443, "kl": 0.11586001515388489, "learning_rate": 4.065555555555556e-06, "loss": -0.0682, "num_tokens": 810291.0, "reward": 7.875, "reward_std": 0.25, "rewards/reward_combined/mean": 7.875, "rewards/reward_combined/std": 0.25, "step": 2683 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 26.5, "completions/clipped_ratio": 0.0, "completions/max_length": 28.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 26.5, "completions/mean_terminated_length": 26.5, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 49.7037037037037, "frac_reward_zero_std": 0.0, "grad_norm": 3.5755021572113037, "kl": 0.09809911996126175, "learning_rate": 4.065e-06, "loss": 0.003, "num_tokens": 810649.0, "reward": 4.0, "reward_std": 4.041451930999756, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 4.041451930999756, "step": 2684 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 49.72222222222222, "frac_reward_zero_std": 1.0, "grad_norm": 0.07206225395202637, "kl": 0.003446408867603168, "learning_rate": 4.064444444444444e-06, "loss": 0.0002, "num_tokens": 810925.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2685 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 49.74074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.08185591548681259, "kl": 0.2801016867160797, "learning_rate": 4.06388888888889e-06, "loss": 0.014, "num_tokens": 811229.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2686 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.25, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 11.25, "completions/mean_terminated_length": 11.25, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 49.75925925925926, "frac_reward_zero_std": 0.0, "grad_norm": 6.126208782196045, "kl": 0.1496178365778178, "learning_rate": 4.063333333333334e-06, "loss": 0.1422, "num_tokens": 811494.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 2687 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 49.77777777777778, "frac_reward_zero_std": 1.0, "grad_norm": 0.10313506424427032, "kl": 0.007681076880544424, "learning_rate": 4.062777777777778e-06, "loss": 0.0004, "num_tokens": 811764.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2688 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.25, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 17.25, "completions/mean_terminated_length": 17.25, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 49.7962962962963, "frac_reward_zero_std": 0.0, "grad_norm": 3.219470977783203, "kl": 0.12928896769881248, "learning_rate": 4.062222222222223e-06, "loss": -0.0365, "num_tokens": 812073.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 2689 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 49.81481481481482, "frac_reward_zero_std": 1.0, "grad_norm": 0.006968971341848373, "kl": 0.0014497876400128007, "learning_rate": 4.061666666666667e-06, "loss": 0.0001, "num_tokens": 812333.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 2690 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.25, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 4.25, "completions/mean_terminated_length": 4.25, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 49.833333333333336, "frac_reward_zero_std": 1.0, "grad_norm": 0.13151989877223969, "kl": 0.006864190101623535, "learning_rate": 4.061111111111111e-06, "loss": 0.0003, "num_tokens": 812546.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 2691 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 49.851851851851855, "frac_reward_zero_std": 1.0, "grad_norm": 0.09733017534017563, "kl": 0.015424824319779873, "learning_rate": 4.060555555555556e-06, "loss": 0.0008, "num_tokens": 812814.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2692 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 35.75, "completions/clipped_ratio": 0.0, "completions/max_length": 55.0, "completions/max_terminated_length": 55.0, "completions/mean_length": 35.75, "completions/mean_terminated_length": 35.75, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 49.870370370370374, "frac_reward_zero_std": 1.0, "grad_norm": 0.23689332604408264, "kl": 0.04875888675451279, "learning_rate": 4.060000000000001e-06, "loss": 0.0026, "num_tokens": 813177.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2693 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 49.888888888888886, "frac_reward_zero_std": 1.0, "grad_norm": 0.030801482498645782, "kl": 0.03266260027885437, "learning_rate": 4.0594444444444444e-06, "loss": 0.0016, "num_tokens": 813461.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2694 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 49.907407407407405, "frac_reward_zero_std": 1.0, "grad_norm": 0.033689793199300766, "kl": 0.032630862668156624, "learning_rate": 4.05888888888889e-06, "loss": 0.0016, "num_tokens": 813729.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2695 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.25, "completions/clipped_ratio": 0.0, "completions/max_length": 25.0, "completions/max_terminated_length": 25.0, "completions/mean_length": 24.25, "completions/mean_terminated_length": 24.25, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 49.925925925925924, "frac_reward_zero_std": 1.0, "grad_norm": 0.009693292900919914, "kl": 0.08947086334228516, "learning_rate": 4.058333333333333e-06, "loss": 0.0045, "num_tokens": 814094.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 2696 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 49.94444444444444, "frac_reward_zero_std": 0.0, "grad_norm": 5.487831115722656, "kl": 0.015484350442420691, "learning_rate": 4.057777777777778e-06, "loss": 0.0372, "num_tokens": 814416.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 2697 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 49.96296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.00458831200376153, "kl": 0.007901026867330074, "learning_rate": 4.057222222222223e-06, "loss": 0.0004, "num_tokens": 814728.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 2698 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 49.98148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.08251497894525528, "kl": 0.04311263561248779, "learning_rate": 4.056666666666667e-06, "loss": 0.0022, "num_tokens": 815020.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2699 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.25, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 19.25, "completions/mean_terminated_length": 19.25, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 50.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.17389729619026184, "kl": 0.051268475130200386, "learning_rate": 4.0561111111111114e-06, "loss": 0.0026, "num_tokens": 815317.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 2700 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.5, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 6.5, "completions/mean_terminated_length": 6.5, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 50.01851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.038415856659412384, "kl": 0.003865062586555723, "learning_rate": 4.055555555555556e-06, "loss": 0.0003, "num_tokens": 815543.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 2701 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 50.03703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.14405851066112518, "kl": 0.3103042244911194, "learning_rate": 4.055000000000001e-06, "loss": 0.0155, "num_tokens": 815847.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2702 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.5, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 50.05555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.02997625432908535, "kl": 0.004163100791629404, "learning_rate": 4.0544444444444445e-06, "loss": 0.0002, "num_tokens": 816169.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2703 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 50.074074074074076, "frac_reward_zero_std": 1.0, "grad_norm": 0.028582559898495674, "kl": 0.007906584069132805, "learning_rate": 4.05388888888889e-06, "loss": 0.0004, "num_tokens": 816481.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 2704 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 50.092592592592595, "frac_reward_zero_std": 1.0, "grad_norm": 0.14373166859149933, "kl": 0.03305418603122234, "learning_rate": 4.053333333333333e-06, "loss": 0.0017, "num_tokens": 816781.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2705 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 50.111111111111114, "frac_reward_zero_std": 1.0, "grad_norm": 0.2387031465768814, "kl": 0.0987517274916172, "learning_rate": 4.052777777777778e-06, "loss": 0.0049, "num_tokens": 817049.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2706 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.5, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 18.5, "completions/mean_terminated_length": 18.5, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 50.129629629629626, "frac_reward_zero_std": 1.0, "grad_norm": 0.089170902967453, "kl": 0.012737357581499964, "learning_rate": 4.052222222222223e-06, "loss": 0.0007, "num_tokens": 817363.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2707 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 50.148148148148145, "frac_reward_zero_std": 1.0, "grad_norm": 0.05459803342819214, "kl": 0.010002813069149852, "learning_rate": 4.051666666666667e-06, "loss": 0.0005, "num_tokens": 817641.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2708 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.25, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.25, "completions/mean_terminated_length": 6.25, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 50.166666666666664, "frac_reward_zero_std": 1.0, "grad_norm": 0.10188326984643936, "kl": 0.01283614942803979, "learning_rate": 4.0511111111111115e-06, "loss": 0.0006, "num_tokens": 817874.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 2709 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 50.18518518518518, "frac_reward_zero_std": 1.0, "grad_norm": 0.00013951380969956517, "kl": 5.404651165008545e-05, "learning_rate": 4.050555555555556e-06, "loss": 0.0, "num_tokens": 818094.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 2710 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.0, "completions/clipped_ratio": 0.0, "completions/max_length": 30.0, "completions/max_terminated_length": 30.0, "completions/mean_length": 24.0, "completions/mean_terminated_length": 24.0, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 50.2037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.052707068622112274, "kl": 0.015673666261136532, "learning_rate": 4.05e-06, "loss": 0.0008, "num_tokens": 818418.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2711 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 34.25, "completions/clipped_ratio": 0.0, "completions/max_length": 45.0, "completions/max_terminated_length": 45.0, "completions/mean_length": 34.25, "completions/mean_terminated_length": 34.25, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 50.22222222222222, "frac_reward_zero_std": 1.0, "grad_norm": 0.19024749100208282, "kl": 0.05695374682545662, "learning_rate": 4.0494444444444445e-06, "loss": 0.003, "num_tokens": 818779.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2712 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.75, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 4.75, "completions/mean_terminated_length": 4.75, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 50.24074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.3285292685031891, "kl": 0.04714204929769039, "learning_rate": 4.04888888888889e-06, "loss": 0.0025, "num_tokens": 818998.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 2713 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 50.25925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.0023603145964443684, "kl": 0.018220335245132446, "learning_rate": 4.048333333333333e-06, "loss": 0.0009, "num_tokens": 819258.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2714 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.75, "completions/clipped_ratio": 0.0, "completions/max_length": 27.0, "completions/max_terminated_length": 27.0, "completions/mean_length": 22.75, "completions/mean_terminated_length": 22.75, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 50.27777777777778, "frac_reward_zero_std": 0.0, "grad_norm": 3.6233162879943848, "kl": 0.1695479080080986, "learning_rate": 4.0477777777777785e-06, "loss": 0.0126, "num_tokens": 819577.0, "reward": 5.75, "reward_std": 2.598076105117798, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 2.598076105117798, "step": 2715 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.0, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 21.0, "completions/mean_terminated_length": 21.0, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 50.2962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.12564034759998322, "kl": 0.02794989012181759, "learning_rate": 4.047222222222222e-06, "loss": 0.0014, "num_tokens": 819905.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2716 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 50.31481481481482, "frac_reward_zero_std": 1.0, "grad_norm": 0.0237303264439106, "kl": 0.003824383020401001, "learning_rate": 4.046666666666667e-06, "loss": 0.0002, "num_tokens": 820165.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 2717 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.75, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 20.75, "completions/mean_terminated_length": 20.75, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 50.333333333333336, "frac_reward_zero_std": 1.0, "grad_norm": 0.15126870572566986, "kl": 0.08249524608254433, "learning_rate": 4.0461111111111115e-06, "loss": 0.0042, "num_tokens": 820468.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 2718 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.75, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 20.75, "completions/mean_terminated_length": 20.75, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 50.351851851851855, "frac_reward_zero_std": 1.0, "grad_norm": 0.2833561897277832, "kl": 0.05747674033045769, "learning_rate": 4.045555555555556e-06, "loss": 0.0029, "num_tokens": 820771.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2719 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 50.370370370370374, "frac_reward_zero_std": 1.0, "grad_norm": 0.060817938297986984, "kl": 0.029866088181734085, "learning_rate": 4.045e-06, "loss": 0.0015, "num_tokens": 821039.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2720 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 42.25, "completions/clipped_ratio": 0.0, "completions/max_length": 94.0, "completions/max_terminated_length": 94.0, "completions/mean_length": 42.25, "completions/mean_terminated_length": 42.25, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 50.388888888888886, "frac_reward_zero_std": 0.0, "grad_norm": 2.756636381149292, "kl": 0.03746424335986376, "learning_rate": 4.044444444444445e-06, "loss": 0.2709, "num_tokens": 821460.0, "reward": 7.125, "reward_std": 0.75, "rewards/reward_combined/mean": 7.125, "rewards/reward_combined/std": 0.75, "step": 2721 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.5, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 21.5, "completions/mean_terminated_length": 21.5, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 50.407407407407405, "frac_reward_zero_std": 1.0, "grad_norm": 0.24509547650814056, "kl": 0.048160421662032604, "learning_rate": 4.04388888888889e-06, "loss": 0.0026, "num_tokens": 821794.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 2722 }, { "clip_ratio/high_max": 0.008196720853447914, "clip_ratio/high_mean": 0.008196720853447914, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.008196720853447914, "completion_length": 28.5, "completions/clipped_ratio": 0.0, "completions/max_length": 33.0, "completions/max_terminated_length": 33.0, "completions/mean_length": 28.5, "completions/mean_terminated_length": 28.5, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 50.425925925925924, "frac_reward_zero_std": 0.0, "grad_norm": 2.1966304779052734, "kl": 0.12312949448823929, "learning_rate": 4.043333333333333e-06, "loss": 0.0045, "num_tokens": 822144.0, "reward": 3.25, "reward_std": 0.28867512941360474, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 0.28867512941360474, "step": 2723 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 50.44444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.007628944236785173, "kl": 0.009935982525348663, "learning_rate": 4.0427777777777785e-06, "loss": 0.0005, "num_tokens": 822380.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 2724 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.0, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 21.0, "completions/mean_terminated_length": 21.0, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 50.46296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.10640982538461685, "kl": 0.027986633591353893, "learning_rate": 4.042222222222222e-06, "loss": 0.0014, "num_tokens": 822716.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2725 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 50.48148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.07960741966962814, "kl": 0.01602306915447116, "learning_rate": 4.041666666666667e-06, "loss": 0.0008, "num_tokens": 823004.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2726 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.5, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 10.5, "completions/mean_terminated_length": 10.5, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 50.5, "frac_reward_zero_std": 1.0, "grad_norm": 0.10263683646917343, "kl": 0.01484584016725421, "learning_rate": 4.0411111111111116e-06, "loss": 0.0007, "num_tokens": 823270.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2727 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.75, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 21.75, "completions/mean_terminated_length": 21.75, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 50.51851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.0913827046751976, "kl": 0.03966493718326092, "learning_rate": 4.040555555555556e-06, "loss": 0.002, "num_tokens": 823605.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2728 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 28.25, "completions/clipped_ratio": 0.0, "completions/max_length": 29.0, "completions/max_terminated_length": 29.0, "completions/mean_length": 28.25, "completions/mean_terminated_length": 28.25, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 50.53703703703704, "frac_reward_zero_std": 0.0, "grad_norm": 1.9948644638061523, "kl": 0.19100388884544373, "learning_rate": 4.04e-06, "loss": 0.0168, "num_tokens": 823934.0, "reward": 4.5, "reward_std": 4.041451930999756, "rewards/reward_combined/mean": 4.5, "rewards/reward_combined/std": 4.041451930999756, "step": 2729 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 44.0, "completions/clipped_ratio": 0.0, "completions/max_length": 45.0, "completions/max_terminated_length": 45.0, "completions/mean_length": 44.0, "completions/mean_terminated_length": 44.0, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 50.55555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 1.051844596862793, "kl": 0.20092501118779182, "learning_rate": 4.039444444444445e-06, "loss": 0.0102, "num_tokens": 824390.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 2730 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 50.574074074074076, "frac_reward_zero_std": 1.0, "grad_norm": 0.003345109522342682, "kl": 0.00017402321100234985, "learning_rate": 4.038888888888889e-06, "loss": 0.0, "num_tokens": 824602.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 2731 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 50.592592592592595, "frac_reward_zero_std": 1.0, "grad_norm": 0.00676038209348917, "kl": 0.00300746934954077, "learning_rate": 4.038333333333333e-06, "loss": 0.0002, "num_tokens": 824886.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2732 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.0, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 24.0, "completions/mean_terminated_length": 24.0, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 50.611111111111114, "frac_reward_zero_std": 1.0, "grad_norm": 0.002437782473862171, "kl": 0.09053615853190422, "learning_rate": 4.0377777777777786e-06, "loss": 0.0045, "num_tokens": 825250.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 2733 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 50.629629629629626, "frac_reward_zero_std": 0.0, "grad_norm": 2.8063671588897705, "kl": 0.06257302686572075, "learning_rate": 4.037222222222222e-06, "loss": 0.0511, "num_tokens": 825522.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 2734 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 50.648148148148145, "frac_reward_zero_std": 1.0, "grad_norm": 0.08870581537485123, "kl": 0.006522701762150973, "learning_rate": 4.036666666666667e-06, "loss": 0.0004, "num_tokens": 825786.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 2735 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 50.666666666666664, "frac_reward_zero_std": 1.0, "grad_norm": 0.03958817198872566, "kl": 0.0012447029585018754, "learning_rate": 4.036111111111112e-06, "loss": 0.0001, "num_tokens": 826042.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 2736 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 50.68518518518518, "frac_reward_zero_std": 1.0, "grad_norm": 0.27646923065185547, "kl": 0.027681004256010056, "learning_rate": 4.035555555555556e-06, "loss": 0.0013, "num_tokens": 826308.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2737 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 89.75, "completions/clipped_ratio": 0.25, "completions/max_length": 256.0, "completions/max_terminated_length": 36.0, "completions/mean_length": 89.75, "completions/mean_terminated_length": 34.333335876464844, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 50.7037037037037, "frac_reward_zero_std": 0.0, "grad_norm": 1.576460361480713, "kl": 0.07309076935052872, "learning_rate": 4.035e-06, "loss": 0.3844, "num_tokens": 826895.0, "reward": 5.925000190734863, "reward_std": 3.8239378929138184, "rewards/reward_combined/mean": 5.925000190734863, "rewards/reward_combined/std": 3.8239378929138184, "step": 2738 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 50.72222222222222, "frac_reward_zero_std": 1.0, "grad_norm": 0.17673730850219727, "kl": 0.03797088083229028, "learning_rate": 4.034444444444445e-06, "loss": 0.0018, "num_tokens": 827173.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2739 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 50.74074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.01096351258456707, "kl": 0.0004047602415084839, "learning_rate": 4.033888888888889e-06, "loss": 0.0, "num_tokens": 827377.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 2740 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.75, "completions/clipped_ratio": 0.0, "completions/max_length": 45.0, "completions/max_terminated_length": 45.0, "completions/mean_length": 23.75, "completions/mean_terminated_length": 23.75, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 50.75925925925926, "frac_reward_zero_std": 0.0, "grad_norm": 6.218863010406494, "kl": 0.03800387494266033, "learning_rate": 4.033333333333333e-06, "loss": 0.3265, "num_tokens": 827692.0, "reward": 4.875, "reward_std": 3.1983067989349365, "rewards/reward_combined/mean": 4.875, "rewards/reward_combined/std": 3.1983067989349365, "step": 2741 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.5, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 9.5, "completions/mean_terminated_length": 9.5, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 50.77777777777778, "frac_reward_zero_std": 1.0, "grad_norm": 0.05898116156458855, "kl": 0.007683770498260856, "learning_rate": 4.032777777777779e-06, "loss": 0.0004, "num_tokens": 827950.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2742 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.75, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 11.75, "completions/mean_terminated_length": 11.75, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 50.7962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.08045131713151932, "kl": 0.015898645855486393, "learning_rate": 4.032222222222222e-06, "loss": 0.0008, "num_tokens": 828225.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2743 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 50.81481481481482, "frac_reward_zero_std": 1.0, "grad_norm": 0.031735729426145554, "kl": 0.021450305357575417, "learning_rate": 4.031666666666667e-06, "loss": 0.0011, "num_tokens": 828501.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2744 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 50.833333333333336, "frac_reward_zero_std": 1.0, "grad_norm": 0.012354421429336071, "kl": 0.002554193139076233, "learning_rate": 4.031111111111111e-06, "loss": 0.0001, "num_tokens": 828745.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 2745 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.5, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 17.5, "completions/mean_terminated_length": 17.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 50.851851851851855, "frac_reward_zero_std": 1.0, "grad_norm": 0.6442304253578186, "kl": 0.2168232798576355, "learning_rate": 4.030555555555556e-06, "loss": 0.011, "num_tokens": 829055.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2746 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 50.870370370370374, "frac_reward_zero_std": 1.0, "grad_norm": 0.30859634280204773, "kl": 0.030582631356082857, "learning_rate": 4.03e-06, "loss": 0.0015, "num_tokens": 829351.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2747 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.25, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.25, "completions/mean_terminated_length": 15.25, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 50.888888888888886, "frac_reward_zero_std": 1.0, "grad_norm": 0.051735807210206985, "kl": 0.019822789821773767, "learning_rate": 4.029444444444445e-06, "loss": 0.001, "num_tokens": 829644.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2748 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 50.907407407407405, "frac_reward_zero_std": 1.0, "grad_norm": 0.015282036736607552, "kl": 0.035875141620635986, "learning_rate": 4.028888888888889e-06, "loss": 0.0018, "num_tokens": 829928.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2749 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 28.5, "completions/clipped_ratio": 0.0, "completions/max_length": 48.0, "completions/max_terminated_length": 48.0, "completions/mean_length": 28.5, "completions/mean_terminated_length": 28.5, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 50.925925925925924, "frac_reward_zero_std": 1.0, "grad_norm": 0.2394024133682251, "kl": 0.09921546280384064, "learning_rate": 4.0283333333333334e-06, "loss": 0.0047, "num_tokens": 830262.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2750 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 50.94444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.12112286686897278, "kl": 0.004417027113959193, "learning_rate": 4.027777777777779e-06, "loss": 0.0003, "num_tokens": 830478.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 2751 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 27.25, "completions/clipped_ratio": 0.0, "completions/max_length": 43.0, "completions/max_terminated_length": 43.0, "completions/mean_length": 27.25, "completions/mean_terminated_length": 27.25, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 50.96296296296296, "frac_reward_zero_std": 0.0, "grad_norm": 3.751091718673706, "kl": 0.07582315430045128, "learning_rate": 4.027222222222222e-06, "loss": -0.158, "num_tokens": 830811.0, "reward": 7.625, "reward_std": 0.25, "rewards/reward_combined/mean": 7.625, "rewards/reward_combined/std": 0.25, "step": 2752 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 50.98148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.011615704745054245, "kl": 0.0009362673154100776, "learning_rate": 4.026666666666667e-06, "loss": 0.0, "num_tokens": 831083.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2753 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.5, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 19.5, "completions/mean_terminated_length": 19.5, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 51.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.043848343193531036, "kl": 0.01364201307296753, "learning_rate": 4.026111111111111e-06, "loss": 0.0007, "num_tokens": 831389.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2754 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.5, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 19.5, "completions/mean_terminated_length": 19.5, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 51.01851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.05231159180402756, "kl": 0.12365646287798882, "learning_rate": 4.025555555555556e-06, "loss": 0.0062, "num_tokens": 831695.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2755 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.25, "completions/clipped_ratio": 0.0, "completions/max_length": 42.0, "completions/max_terminated_length": 42.0, "completions/mean_length": 20.25, "completions/mean_terminated_length": 20.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 51.03703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.06116216257214546, "kl": 0.02782573807053268, "learning_rate": 4.0250000000000004e-06, "loss": 0.0021, "num_tokens": 832000.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2756 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.5, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 19.5, "completions/mean_terminated_length": 19.5, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 51.05555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.03797819837927818, "kl": 0.013321518432348967, "learning_rate": 4.024444444444445e-06, "loss": 0.0007, "num_tokens": 832306.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2757 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 51.074074074074076, "frac_reward_zero_std": 1.0, "grad_norm": 0.025775158777832985, "kl": 0.008090959396213293, "learning_rate": 4.023888888888889e-06, "loss": 0.0004, "num_tokens": 832578.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2758 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 51.092592592592595, "frac_reward_zero_std": 1.0, "grad_norm": 0.1998918354511261, "kl": 0.005730785429477692, "learning_rate": 4.0233333333333335e-06, "loss": 0.0004, "num_tokens": 832786.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 2759 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.0, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 21.0, "completions/mean_terminated_length": 21.0, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 51.111111111111114, "frac_reward_zero_std": 1.0, "grad_norm": 0.03295417129993439, "kl": 0.032498933374881744, "learning_rate": 4.022777777777778e-06, "loss": 0.0016, "num_tokens": 833118.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2760 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 25.0, "completions/max_terminated_length": 25.0, "completions/mean_length": 18.0, "completions/mean_terminated_length": 18.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 51.129629629629626, "frac_reward_zero_std": 0.0, "grad_norm": 5.0125226974487305, "kl": 0.03256931155920029, "learning_rate": 4.022222222222222e-06, "loss": 0.1017, "num_tokens": 833410.0, "reward": 4.625, "reward_std": 3.4731109142303467, "rewards/reward_combined/mean": 4.625, "rewards/reward_combined/std": 3.4731109142303467, "step": 2761 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 51.148148148148145, "frac_reward_zero_std": 1.0, "grad_norm": 0.01164338830858469, "kl": 0.0011518668325152248, "learning_rate": 4.021666666666667e-06, "loss": 0.0001, "num_tokens": 833688.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2762 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.25, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 16.25, "completions/mean_terminated_length": 16.25, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 51.166666666666664, "frac_reward_zero_std": 1.0, "grad_norm": 0.1066206693649292, "kl": 0.2539501339197159, "learning_rate": 4.021111111111111e-06, "loss": 0.0127, "num_tokens": 833993.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2763 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 51.18518518518518, "frac_reward_zero_std": 1.0, "grad_norm": 0.004737009294331074, "kl": 0.0036779120564460754, "learning_rate": 4.020555555555556e-06, "loss": 0.0002, "num_tokens": 834253.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 2764 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 51.2037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.0020563495345413685, "kl": 0.018283572979271412, "learning_rate": 4.0200000000000005e-06, "loss": 0.0009, "num_tokens": 834513.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2765 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.75, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.75, "completions/mean_terminated_length": 14.75, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 51.22222222222222, "frac_reward_zero_std": 1.0, "grad_norm": 0.005331563297659159, "kl": 0.0011047624866478145, "learning_rate": 4.019444444444445e-06, "loss": 0.0001, "num_tokens": 834832.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2766 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.0, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 20.0, "completions/mean_terminated_length": 20.0, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 51.24074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.17215876281261444, "kl": 0.03626284468919039, "learning_rate": 4.018888888888889e-06, "loss": 0.0018, "num_tokens": 835132.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2767 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 51.25925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.019990094006061554, "kl": 0.03266142029315233, "learning_rate": 4.0183333333333335e-06, "loss": 0.0017, "num_tokens": 835402.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2768 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 51.27777777777778, "frac_reward_zero_std": 1.0, "grad_norm": 0.049160104244947433, "kl": 0.03457197058014572, "learning_rate": 4.017777777777778e-06, "loss": 0.0017, "num_tokens": 835694.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2769 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 33.25, "completions/clipped_ratio": 0.0, "completions/max_length": 38.0, "completions/max_terminated_length": 38.0, "completions/mean_length": 33.25, "completions/mean_terminated_length": 33.25, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 51.2962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.048793092370033264, "kl": 0.028072807006537914, "learning_rate": 4.017222222222222e-06, "loss": 0.0014, "num_tokens": 836051.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2770 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.25, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 16.25, "completions/mean_terminated_length": 16.25, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 51.31481481481482, "frac_reward_zero_std": 0.0, "grad_norm": 4.605368614196777, "kl": 0.03595086187124252, "learning_rate": 4.0166666666666675e-06, "loss": 0.0475, "num_tokens": 836348.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 2771 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 38.5, "completions/clipped_ratio": 0.0, "completions/max_length": 61.0, "completions/max_terminated_length": 61.0, "completions/mean_length": 38.5, "completions/mean_terminated_length": 38.5, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 51.333333333333336, "frac_reward_zero_std": 1.0, "grad_norm": 0.17968162894248962, "kl": 0.10588159039616585, "learning_rate": 4.016111111111111e-06, "loss": 0.0058, "num_tokens": 836738.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 2772 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.25, "completions/clipped_ratio": 0.0, "completions/max_length": 28.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 16.25, "completions/mean_terminated_length": 16.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 51.351851851851855, "frac_reward_zero_std": 0.0, "grad_norm": 4.980489253997803, "kl": 0.07613779418170452, "learning_rate": 4.015555555555556e-06, "loss": 0.2211, "num_tokens": 837019.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 2773 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 5.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 5.0, "completions/mean_terminated_length": 5.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 51.370370370370374, "frac_reward_zero_std": 0.0, "grad_norm": 8.71817398071289, "kl": 0.03549391851993278, "learning_rate": 4.0150000000000005e-06, "loss": 0.1313, "num_tokens": 837235.0, "reward": 2.75, "reward_std": 1.5, "rewards/reward_combined/mean": 2.75, "rewards/reward_combined/std": 1.5, "step": 2774 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 51.388888888888886, "frac_reward_zero_std": 1.0, "grad_norm": 0.011085791513323784, "kl": 0.001993313431739807, "learning_rate": 4.014444444444445e-06, "loss": 0.0001, "num_tokens": 837479.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 2775 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.0, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 22.0, "completions/mean_terminated_length": 22.0, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 51.407407407407405, "frac_reward_zero_std": 1.0, "grad_norm": 0.015715235844254494, "kl": 0.012415406294167042, "learning_rate": 4.013888888888889e-06, "loss": 0.0006, "num_tokens": 837795.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2776 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 51.425925925925924, "frac_reward_zero_std": 1.0, "grad_norm": 0.22969426214694977, "kl": 0.019889578048605472, "learning_rate": 4.013333333333334e-06, "loss": 0.0011, "num_tokens": 838095.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2777 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.5, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 18.5, "completions/mean_terminated_length": 18.5, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 51.44444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.0021361815743148327, "kl": 0.0006711635505780578, "learning_rate": 4.012777777777778e-06, "loss": 0.0, "num_tokens": 838409.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2778 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.0, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 20.0, "completions/mean_terminated_length": 20.0, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 51.46296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.06354420632123947, "kl": 0.028223919682204723, "learning_rate": 4.012222222222222e-06, "loss": 0.0014, "num_tokens": 838741.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2779 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.5, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 4.5, "completions/mean_terminated_length": 4.5, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 51.48148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.009872060269117355, "kl": 0.0028295934171183035, "learning_rate": 4.011666666666667e-06, "loss": 0.0001, "num_tokens": 838959.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 2780 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 51.5, "frac_reward_zero_std": 1.0, "grad_norm": 0.01042289286851883, "kl": 0.0002461165131535381, "learning_rate": 4.011111111111111e-06, "loss": 0.0, "num_tokens": 839215.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 2781 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 51.51851851851852, "frac_reward_zero_std": 0.0, "grad_norm": 3.1676218509674072, "kl": 0.033197094686329365, "learning_rate": 4.010555555555556e-06, "loss": 0.0662, "num_tokens": 839493.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 2782 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 27.75, "completions/clipped_ratio": 0.0, "completions/max_length": 37.0, "completions/max_terminated_length": 37.0, "completions/mean_length": 27.75, "completions/mean_terminated_length": 27.75, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 51.53703703703704, "frac_reward_zero_std": 0.0, "grad_norm": 3.763333797454834, "kl": 0.11543813720345497, "learning_rate": 4.0100000000000006e-06, "loss": 0.1162, "num_tokens": 839828.0, "reward": 6.625, "reward_std": 2.0966243743896484, "rewards/reward_combined/mean": 6.625, "rewards/reward_combined/std": 2.0966243743896484, "step": 2783 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.5, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 20.5, "completions/mean_terminated_length": 20.5, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 51.55555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.10864896327257156, "kl": 0.03396129235625267, "learning_rate": 4.009444444444445e-06, "loss": 0.0017, "num_tokens": 840154.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2784 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 51.574074074074076, "frac_reward_zero_std": 1.0, "grad_norm": 0.0046960837207734585, "kl": 0.007737419568002224, "learning_rate": 4.008888888888889e-06, "loss": 0.0004, "num_tokens": 840466.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 2785 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 51.592592592592595, "frac_reward_zero_std": 1.0, "grad_norm": 0.00950843095779419, "kl": 0.00959121435880661, "learning_rate": 4.008333333333334e-06, "loss": 0.0005, "num_tokens": 840702.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 2786 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.25, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 13.25, "completions/mean_terminated_length": 13.25, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 51.611111111111114, "frac_reward_zero_std": 0.0, "grad_norm": 5.464286804199219, "kl": 0.009934860107023269, "learning_rate": 4.007777777777778e-06, "loss": 0.1951, "num_tokens": 840975.0, "reward": 2.75, "reward_std": 1.5, "rewards/reward_combined/mean": 2.75, "rewards/reward_combined/std": 1.5, "step": 2787 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 51.629629629629626, "frac_reward_zero_std": 1.0, "grad_norm": 0.14902332425117493, "kl": 0.03984348103404045, "learning_rate": 4.007222222222222e-06, "loss": 0.002, "num_tokens": 841271.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2788 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 72.5, "completions/clipped_ratio": 0.25, "completions/max_length": 256.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 72.5, "completions/mean_terminated_length": 11.333333969116211, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 51.648148148148145, "frac_reward_zero_std": 0.0, "grad_norm": 1.8395956754684448, "kl": 0.010641406755894423, "learning_rate": 4.006666666666667e-06, "loss": 0.4571, "num_tokens": 841789.0, "reward": 6.050000190734863, "reward_std": 3.9000003337860107, "rewards/reward_combined/mean": 6.050000190734863, "rewards/reward_combined/std": 3.9000000953674316, "step": 2789 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.25, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 11.25, "completions/mean_terminated_length": 11.25, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 51.666666666666664, "frac_reward_zero_std": 1.0, "grad_norm": 1.0821771621704102, "kl": 0.12076600268483162, "learning_rate": 4.006111111111111e-06, "loss": 0.0089, "num_tokens": 842034.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 2790 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 17.0, "completions/mean_terminated_length": 17.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 51.68518518518518, "frac_reward_zero_std": 1.0, "grad_norm": 0.014136845245957375, "kl": 0.17300479859113693, "learning_rate": 4.005555555555556e-06, "loss": 0.0087, "num_tokens": 842342.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2791 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 51.7037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.007909612730145454, "kl": 0.0008131563663482666, "learning_rate": 4.005000000000001e-06, "loss": 0.0, "num_tokens": 842554.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 2792 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.25, "completions/clipped_ratio": 0.0, "completions/max_length": 30.0, "completions/max_terminated_length": 30.0, "completions/mean_length": 22.25, "completions/mean_terminated_length": 22.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 51.72222222222222, "frac_reward_zero_std": 1.0, "grad_norm": 0.2029624581336975, "kl": 0.09578743763267994, "learning_rate": 4.004444444444445e-06, "loss": 0.0052, "num_tokens": 842863.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2793 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 51.74074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.00015608829562552273, "kl": 4.728883504867554e-05, "learning_rate": 4.003888888888889e-06, "loss": 0.0, "num_tokens": 843083.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 2794 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 51.75925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.007601125165820122, "kl": 0.003088561468757689, "learning_rate": 4.003333333333334e-06, "loss": 0.0002, "num_tokens": 843367.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2795 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.0, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 24.0, "completions/mean_terminated_length": 24.0, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 51.77777777777778, "frac_reward_zero_std": 1.0, "grad_norm": 0.002563565270975232, "kl": 0.09050993621349335, "learning_rate": 4.002777777777778e-06, "loss": 0.0045, "num_tokens": 843731.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 2796 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 51.7962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.02894725650548935, "kl": 0.03337625414133072, "learning_rate": 4.002222222222222e-06, "loss": 0.0017, "num_tokens": 844015.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2797 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 51.81481481481482, "frac_reward_zero_std": 1.0, "grad_norm": 0.46102604269981384, "kl": 0.06930369511246681, "learning_rate": 4.001666666666667e-06, "loss": 0.0041, "num_tokens": 844271.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2798 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 29.75, "completions/clipped_ratio": 0.0, "completions/max_length": 41.0, "completions/max_terminated_length": 41.0, "completions/mean_length": 29.75, "completions/mean_terminated_length": 29.75, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 51.833333333333336, "frac_reward_zero_std": 0.0, "grad_norm": 2.730393886566162, "kl": 0.1864490583539009, "learning_rate": 4.001111111111111e-06, "loss": 0.1069, "num_tokens": 844606.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 2799 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 51.851851851851855, "frac_reward_zero_std": 1.0, "grad_norm": 0.042966537177562714, "kl": 0.00204540352569893, "learning_rate": 4.0005555555555555e-06, "loss": 0.0001, "num_tokens": 844876.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2800 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.5, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 21.5, "completions/mean_terminated_length": 21.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 51.870370370370374, "frac_reward_zero_std": 1.0, "grad_norm": 0.09104964882135391, "kl": 0.061967603862285614, "learning_rate": 4.000000000000001e-06, "loss": 0.0031, "num_tokens": 845182.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 2801 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 80.25, "completions/clipped_ratio": 0.25, "completions/max_length": 256.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 80.25, "completions/mean_terminated_length": 21.666667938232422, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 51.888888888888886, "frac_reward_zero_std": 0.0, "grad_norm": 1.6456042528152466, "kl": 0.0229738587513566, "learning_rate": 3.999444444444445e-06, "loss": 0.4378, "num_tokens": 845755.0, "reward": 4.875, "reward_std": 5.25, "rewards/reward_combined/mean": 4.875, "rewards/reward_combined/std": 5.25, "step": 2802 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.5, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 10.5, "completions/mean_terminated_length": 10.5, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 51.907407407407405, "frac_reward_zero_std": 1.0, "grad_norm": 0.025559797883033752, "kl": 0.008524391800165176, "learning_rate": 3.998888888888889e-06, "loss": 0.0004, "num_tokens": 846021.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2803 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 7.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 7.0, "completions/mean_terminated_length": 7.0, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 51.925925925925924, "frac_reward_zero_std": 1.0, "grad_norm": 0.015064124017953873, "kl": 0.0006008360942360014, "learning_rate": 3.998333333333334e-06, "loss": 0.0, "num_tokens": 846257.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 2804 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 34.25, "completions/clipped_ratio": 0.0, "completions/max_length": 35.0, "completions/max_terminated_length": 35.0, "completions/mean_length": 34.25, "completions/mean_terminated_length": 34.25, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 51.94444444444444, "frac_reward_zero_std": 0.0, "grad_norm": 3.0986320972442627, "kl": 0.07637502253055573, "learning_rate": 3.997777777777778e-06, "loss": -0.0045, "num_tokens": 846622.0, "reward": 7.875, "reward_std": 0.25, "rewards/reward_combined/mean": 7.875, "rewards/reward_combined/std": 0.25, "step": 2805 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.75, "completions/clipped_ratio": 0.0, "completions/max_length": 29.0, "completions/max_terminated_length": 29.0, "completions/mean_length": 25.75, "completions/mean_terminated_length": 25.75, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 51.96296296296296, "frac_reward_zero_std": 0.0, "grad_norm": 3.257500648498535, "kl": 0.06969080120325089, "learning_rate": 3.9972222222222224e-06, "loss": -0.0737, "num_tokens": 846973.0, "reward": 2.5, "reward_std": 1.7320507764816284, "rewards/reward_combined/mean": 2.5, "rewards/reward_combined/std": 1.7320507764816284, "step": 2806 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 43.0, "completions/clipped_ratio": 0.0, "completions/max_length": 43.0, "completions/max_terminated_length": 43.0, "completions/mean_length": 43.0, "completions/mean_terminated_length": 43.0, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 51.98148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.027185674756765366, "kl": 0.05059566907584667, "learning_rate": 3.996666666666667e-06, "loss": 0.0025, "num_tokens": 847425.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 2807 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 52.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.09351088106632233, "kl": 0.003634140593931079, "learning_rate": 3.996111111111111e-06, "loss": 0.0002, "num_tokens": 847683.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2808 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.75, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 18.75, "completions/mean_terminated_length": 18.75, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 52.01851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.18222758173942566, "kl": 0.04181910306215286, "learning_rate": 3.9955555555555555e-06, "loss": 0.0021, "num_tokens": 847978.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2809 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.25, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 18.25, "completions/mean_terminated_length": 18.25, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 52.03703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.08647073060274124, "kl": 0.01545223779976368, "learning_rate": 3.995000000000001e-06, "loss": 0.0007, "num_tokens": 848279.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2810 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 79.5, "completions/clipped_ratio": 0.25, "completions/max_length": 256.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 79.5, "completions/mean_terminated_length": 20.666667938232422, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 52.05555555555556, "frac_reward_zero_std": 0.0, "grad_norm": 1.648192048072815, "kl": 0.03354305401444435, "learning_rate": 3.994444444444445e-06, "loss": 0.2781, "num_tokens": 848821.0, "reward": 3.924999952316284, "reward_std": 4.8637261390686035, "rewards/reward_combined/mean": 3.924999952316284, "rewards/reward_combined/std": 4.8637261390686035, "step": 2811 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 5.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 5.0, "completions/mean_terminated_length": 5.0, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 52.074074074074076, "frac_reward_zero_std": 1.0, "grad_norm": 0.0036926025059074163, "kl": 0.00045140981092117727, "learning_rate": 3.9938888888888894e-06, "loss": 0.0, "num_tokens": 849041.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 2812 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 52.092592592592595, "frac_reward_zero_std": 1.0, "grad_norm": 0.023250672966241837, "kl": 0.004036370664834976, "learning_rate": 3.993333333333334e-06, "loss": 0.0002, "num_tokens": 849301.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 2813 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 52.111111111111114, "frac_reward_zero_std": 1.0, "grad_norm": 0.007537446450442076, "kl": 0.009919345378875732, "learning_rate": 3.992777777777778e-06, "loss": 0.0005, "num_tokens": 849537.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 2814 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 52.129629629629626, "frac_reward_zero_std": 1.0, "grad_norm": 0.004632078111171722, "kl": 0.0020077265799045563, "learning_rate": 3.9922222222222225e-06, "loss": 0.0001, "num_tokens": 849781.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 2815 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 26.25, "completions/clipped_ratio": 0.0, "completions/max_length": 29.0, "completions/max_terminated_length": 29.0, "completions/mean_length": 26.25, "completions/mean_terminated_length": 26.25, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 52.148148148148145, "frac_reward_zero_std": 1.0, "grad_norm": 0.13782809674739838, "kl": 0.038729386404156685, "learning_rate": 3.991666666666667e-06, "loss": 0.002, "num_tokens": 850134.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 2816 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.25, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.25, "completions/mean_terminated_length": 15.25, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 52.166666666666664, "frac_reward_zero_std": 0.0, "grad_norm": 3.1650729179382324, "kl": 0.0605238932184875, "learning_rate": 3.991111111111112e-06, "loss": 0.0162, "num_tokens": 850427.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 2817 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.5, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 18.5, "completions/mean_terminated_length": 18.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 52.18518518518518, "frac_reward_zero_std": 1.0, "grad_norm": 0.1835765391588211, "kl": 0.14019063860177994, "learning_rate": 3.9905555555555556e-06, "loss": 0.0069, "num_tokens": 850729.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2818 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 52.2037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.0009411706123501062, "kl": 0.0018760456005111337, "learning_rate": 3.990000000000001e-06, "loss": 0.0001, "num_tokens": 851009.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2819 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 26.75, "completions/clipped_ratio": 0.0, "completions/max_length": 33.0, "completions/max_terminated_length": 33.0, "completions/mean_length": 26.75, "completions/mean_terminated_length": 26.75, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 52.22222222222222, "frac_reward_zero_std": 1.0, "grad_norm": 0.2091541737318039, "kl": 0.052779531106352806, "learning_rate": 3.989444444444444e-06, "loss": 0.0023, "num_tokens": 851340.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2820 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 52.24074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.4107294976711273, "kl": 0.02293887734413147, "learning_rate": 3.9888888888888895e-06, "loss": 0.0011, "num_tokens": 851552.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 2821 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.75, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.75, "completions/mean_terminated_length": 6.75, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 52.25925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.04322386160492897, "kl": 0.002730356063693762, "learning_rate": 3.988333333333334e-06, "loss": 0.0001, "num_tokens": 851787.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 2822 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 52.27777777777778, "frac_reward_zero_std": 1.0, "grad_norm": 0.015248262323439121, "kl": 0.0033239974873140454, "learning_rate": 3.987777777777778e-06, "loss": 0.0002, "num_tokens": 852071.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2823 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 52.2962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.0016293074004352093, "kl": 6.152093556011096e-05, "learning_rate": 3.9872222222222225e-06, "loss": 0.0, "num_tokens": 852341.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2824 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 52.31481481481482, "frac_reward_zero_std": 1.0, "grad_norm": 0.05544976890087128, "kl": 0.007835612632334232, "learning_rate": 3.986666666666667e-06, "loss": 0.0004, "num_tokens": 852653.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 2825 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 52.333333333333336, "frac_reward_zero_std": 1.0, "grad_norm": 0.012157541699707508, "kl": 0.0017687869840301573, "learning_rate": 3.986111111111112e-06, "loss": 0.0001, "num_tokens": 852933.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2826 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.0, "completions/clipped_ratio": 0.0, "completions/max_length": 28.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 25.0, "completions/mean_terminated_length": 25.0, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 52.351851851851855, "frac_reward_zero_std": 1.0, "grad_norm": 0.11248403787612915, "kl": 0.053952621296048164, "learning_rate": 3.985555555555556e-06, "loss": 0.0026, "num_tokens": 853285.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2827 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.25, "completions/clipped_ratio": 0.0, "completions/max_length": 25.0, "completions/max_terminated_length": 25.0, "completions/mean_length": 17.25, "completions/mean_terminated_length": 17.25, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 52.370370370370374, "frac_reward_zero_std": 1.0, "grad_norm": 0.07224182784557343, "kl": 0.027989172376692295, "learning_rate": 3.985000000000001e-06, "loss": 0.0018, "num_tokens": 853574.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 2828 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.5, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 22.5, "completions/mean_terminated_length": 22.5, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 52.388888888888886, "frac_reward_zero_std": 0.0, "grad_norm": 3.3690154552459717, "kl": 0.015884924679994583, "learning_rate": 3.984444444444444e-06, "loss": -0.021, "num_tokens": 853892.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 2829 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.25, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 16.25, "completions/mean_terminated_length": 16.25, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 52.407407407407405, "frac_reward_zero_std": 0.0, "grad_norm": 3.1877214908599854, "kl": 0.22262827306985855, "learning_rate": 3.9838888888888895e-06, "loss": 0.0263, "num_tokens": 854197.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 2830 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 10.5, "completions/mean_terminated_length": 10.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 52.425925925925924, "frac_reward_zero_std": 1.0, "grad_norm": 0.0931578129529953, "kl": 0.005511451279744506, "learning_rate": 3.983333333333334e-06, "loss": 0.0003, "num_tokens": 854459.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2831 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.5, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 10.5, "completions/mean_terminated_length": 10.5, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 52.44444444444444, "frac_reward_zero_std": 0.0, "grad_norm": 5.001567363739014, "kl": 0.17256412282586098, "learning_rate": 3.982777777777778e-06, "loss": -0.0373, "num_tokens": 854725.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 2832 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.25, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 12.25, "completions/mean_terminated_length": 12.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 52.46296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.052402228116989136, "kl": 0.007385361357592046, "learning_rate": 3.982222222222223e-06, "loss": 0.0004, "num_tokens": 854986.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2833 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 52.48148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.007393678650259972, "kl": 0.000606924295425415, "learning_rate": 3.981666666666667e-06, "loss": 0.0, "num_tokens": 855198.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 2834 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 52.5, "frac_reward_zero_std": 1.0, "grad_norm": 0.024900535121560097, "kl": 0.004856374580413103, "learning_rate": 3.981111111111111e-06, "loss": 0.0002, "num_tokens": 855458.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2835 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.5, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 17.5, "completions/mean_terminated_length": 17.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 52.51851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.03949984163045883, "kl": 0.17216356843709946, "learning_rate": 3.980555555555556e-06, "loss": 0.0086, "num_tokens": 855768.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2836 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 52.53703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.08847565203905106, "kl": 0.04010123386979103, "learning_rate": 3.980000000000001e-06, "loss": 0.002, "num_tokens": 856060.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2837 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 52.55555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.002075454918667674, "kl": 0.00026323198835598305, "learning_rate": 3.979444444444444e-06, "loss": 0.0, "num_tokens": 856316.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 2838 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 52.574074074074076, "frac_reward_zero_std": 1.0, "grad_norm": 0.014619325287640095, "kl": 0.0015523095498792827, "learning_rate": 3.9788888888888896e-06, "loss": 0.0001, "num_tokens": 856636.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2839 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 29.0, "completions/clipped_ratio": 0.0, "completions/max_length": 32.0, "completions/max_terminated_length": 32.0, "completions/mean_length": 29.0, "completions/mean_terminated_length": 29.0, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 52.592592592592595, "frac_reward_zero_std": 0.0, "grad_norm": 3.166289806365967, "kl": 0.16122709959745407, "learning_rate": 3.978333333333333e-06, "loss": -0.0181, "num_tokens": 856988.0, "reward": 3.375, "reward_std": 0.25, "rewards/reward_combined/mean": 3.375, "rewards/reward_combined/std": 0.25, "step": 2840 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 52.611111111111114, "frac_reward_zero_std": 1.0, "grad_norm": 0.051654260605573654, "kl": 0.034988428466022015, "learning_rate": 3.977777777777778e-06, "loss": 0.0017, "num_tokens": 857280.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2841 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.5, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 8.5, "completions/mean_terminated_length": 8.5, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 52.629629629629626, "frac_reward_zero_std": 1.0, "grad_norm": 0.058574043214321136, "kl": 0.005594709422439337, "learning_rate": 3.977222222222223e-06, "loss": 0.0002, "num_tokens": 857534.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 2842 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 52.648148148148145, "frac_reward_zero_std": 1.0, "grad_norm": 0.004828379489481449, "kl": 0.017649253830313683, "learning_rate": 3.976666666666667e-06, "loss": 0.0009, "num_tokens": 857794.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2843 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 39.5, "completions/clipped_ratio": 0.0, "completions/max_length": 44.0, "completions/max_terminated_length": 44.0, "completions/mean_length": 39.5, "completions/mean_terminated_length": 39.5, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 52.666666666666664, "frac_reward_zero_std": 0.0, "grad_norm": 1.0645865201950073, "kl": 0.12628623098134995, "learning_rate": 3.976111111111111e-06, "loss": -0.0516, "num_tokens": 858232.0, "reward": 2.375, "reward_std": 1.25, "rewards/reward_combined/mean": 2.375, "rewards/reward_combined/std": 1.25, "step": 2844 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.5, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 2.5, "completions/mean_terminated_length": 2.5, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 52.68518518518518, "frac_reward_zero_std": 1.0, "grad_norm": 0.02166857197880745, "kl": 0.0020010098814964294, "learning_rate": 3.975555555555556e-06, "loss": 0.0001, "num_tokens": 858438.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 2845 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 30.25, "completions/clipped_ratio": 0.0, "completions/max_length": 36.0, "completions/max_terminated_length": 36.0, "completions/mean_length": 30.25, "completions/mean_terminated_length": 30.25, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 52.7037037037037, "frac_reward_zero_std": 0.0, "grad_norm": 3.9028234481811523, "kl": 0.1570107340812683, "learning_rate": 3.975000000000001e-06, "loss": 0.0628, "num_tokens": 858775.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 2846 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.0, "completions/max_length": 35.0, "completions/max_terminated_length": 35.0, "completions/mean_length": 32.0, "completions/mean_terminated_length": 32.0, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 52.72222222222222, "frac_reward_zero_std": 0.0, "grad_norm": 2.187795877456665, "kl": 0.16260290890932083, "learning_rate": 3.974444444444444e-06, "loss": -0.0386, "num_tokens": 859131.0, "reward": 7.875, "reward_std": 0.25, "rewards/reward_combined/mean": 7.875, "rewards/reward_combined/std": 0.25, "step": 2847 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.25, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.25, "completions/mean_terminated_length": 13.25, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 52.74074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.07672496885061264, "kl": 0.11638468131422997, "learning_rate": 3.97388888888889e-06, "loss": 0.0058, "num_tokens": 859400.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2848 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.0, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 20.0, "completions/mean_terminated_length": 20.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 52.75925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.08483583480119705, "kl": 0.016669567674398422, "learning_rate": 3.973333333333333e-06, "loss": 0.0008, "num_tokens": 859732.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2849 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.0, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 24.0, "completions/mean_terminated_length": 24.0, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 52.77777777777778, "frac_reward_zero_std": 1.0, "grad_norm": 0.0026211494114249945, "kl": 0.09051605314016342, "learning_rate": 3.972777777777778e-06, "loss": 0.0045, "num_tokens": 860096.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 2850 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 52.7962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.00015014360542409122, "kl": 5.0373375415802e-05, "learning_rate": 3.972222222222223e-06, "loss": 0.0, "num_tokens": 860316.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 2851 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 52.81481481481482, "frac_reward_zero_std": 1.0, "grad_norm": 0.19332875311374664, "kl": 0.04789129924029112, "learning_rate": 3.971666666666667e-06, "loss": 0.0024, "num_tokens": 860590.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2852 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.5, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 22.5, "completions/mean_terminated_length": 22.5, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 52.833333333333336, "frac_reward_zero_std": 1.0, "grad_norm": 0.044333405792713165, "kl": 0.025508910417556763, "learning_rate": 3.971111111111111e-06, "loss": 0.0013, "num_tokens": 860928.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2853 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 52.851851851851855, "frac_reward_zero_std": 1.0, "grad_norm": 0.08869900554418564, "kl": 0.014251280575990677, "learning_rate": 3.970555555555556e-06, "loss": 0.0007, "num_tokens": 861212.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2854 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 52.870370370370374, "frac_reward_zero_std": 1.0, "grad_norm": 0.02518632635474205, "kl": 0.027658484876155853, "learning_rate": 3.97e-06, "loss": 0.0014, "num_tokens": 861428.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 2855 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 18.0, "completions/mean_terminated_length": 18.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 52.888888888888886, "frac_reward_zero_std": 1.0, "grad_norm": 0.009364251978695393, "kl": 0.002076647477224469, "learning_rate": 3.9694444444444445e-06, "loss": 0.0001, "num_tokens": 861740.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2856 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 52.907407407407405, "frac_reward_zero_std": 0.0, "grad_norm": 5.322850704193115, "kl": 0.11113329604268074, "learning_rate": 3.96888888888889e-06, "loss": 0.0769, "num_tokens": 862040.0, "reward": 5.125, "reward_std": 3.4731109142303467, "rewards/reward_combined/mean": 5.125, "rewards/reward_combined/std": 3.4731109142303467, "step": 2857 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.75, "completions/clipped_ratio": 0.0, "completions/max_length": 30.0, "completions/max_terminated_length": 30.0, "completions/mean_length": 17.75, "completions/mean_terminated_length": 17.75, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 52.925925925925924, "frac_reward_zero_std": 0.0, "grad_norm": 6.6712164878845215, "kl": 0.029897738248109818, "learning_rate": 3.968333333333333e-06, "loss": 0.2046, "num_tokens": 862331.0, "reward": 6.5, "reward_std": 2.0, "rewards/reward_combined/mean": 6.5, "rewards/reward_combined/std": 2.0, "step": 2858 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.75, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 19.75, "completions/mean_terminated_length": 19.75, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 52.94444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.4053316116333008, "kl": 0.0767938494682312, "learning_rate": 3.967777777777778e-06, "loss": 0.0038, "num_tokens": 862630.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2859 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.75, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 10.75, "completions/mean_terminated_length": 10.75, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 52.96296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.09561381489038467, "kl": 0.02041938714683056, "learning_rate": 3.967222222222222e-06, "loss": 0.001, "num_tokens": 862893.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2860 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.75, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 20.75, "completions/mean_terminated_length": 20.75, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 52.98148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.13503406941890717, "kl": 0.027949120849370956, "learning_rate": 3.966666666666667e-06, "loss": 0.0014, "num_tokens": 863220.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2861 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 53.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.4580089747905731, "kl": 0.06766372919082642, "learning_rate": 3.9661111111111114e-06, "loss": 0.0035, "num_tokens": 863498.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2862 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 53.01851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.04767293483018875, "kl": 0.0036425372818484902, "learning_rate": 3.965555555555556e-06, "loss": 0.0002, "num_tokens": 863768.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2863 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 53.03703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.10876729339361191, "kl": 0.07457771524786949, "learning_rate": 3.965e-06, "loss": 0.0036, "num_tokens": 864030.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2864 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 17.0, "completions/mean_terminated_length": 17.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 53.05555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.013513030484318733, "kl": 0.17286069691181183, "learning_rate": 3.9644444444444445e-06, "loss": 0.0086, "num_tokens": 864338.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2865 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.75, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 18.75, "completions/mean_terminated_length": 18.75, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 53.074074074074076, "frac_reward_zero_std": 1.0, "grad_norm": 0.07083126902580261, "kl": 0.04338595736771822, "learning_rate": 3.96388888888889e-06, "loss": 0.0022, "num_tokens": 864633.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 2866 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.25, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.25, "completions/mean_terminated_length": 14.25, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 53.092592592592595, "frac_reward_zero_std": 1.0, "grad_norm": 0.09004184603691101, "kl": 0.01954060699790716, "learning_rate": 3.963333333333333e-06, "loss": 0.001, "num_tokens": 864922.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2867 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.5, "completions/clipped_ratio": 0.0, "completions/max_length": 25.0, "completions/max_terminated_length": 25.0, "completions/mean_length": 20.5, "completions/mean_terminated_length": 20.5, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 53.111111111111114, "frac_reward_zero_std": 1.0, "grad_norm": 0.0829634964466095, "kl": 0.018249171320348978, "learning_rate": 3.9627777777777784e-06, "loss": 0.0009, "num_tokens": 865232.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2868 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.25, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 4.25, "completions/mean_terminated_length": 4.25, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 53.129629629629626, "frac_reward_zero_std": 1.0, "grad_norm": 0.1709134429693222, "kl": 0.012419110629707575, "learning_rate": 3.962222222222222e-06, "loss": 0.0006, "num_tokens": 865445.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 2869 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.25, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 18.25, "completions/mean_terminated_length": 18.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 53.148148148148145, "frac_reward_zero_std": 1.0, "grad_norm": 0.062165532261133194, "kl": 0.005792275187559426, "learning_rate": 3.961666666666667e-06, "loss": 0.0003, "num_tokens": 865758.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2870 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.75, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 16.75, "completions/mean_terminated_length": 16.75, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 53.166666666666664, "frac_reward_zero_std": 0.0, "grad_norm": 2.9325811862945557, "kl": 0.012037452310323715, "learning_rate": 3.9611111111111115e-06, "loss": 0.044, "num_tokens": 866073.0, "reward": 2.75, "reward_std": 1.5, "rewards/reward_combined/mean": 2.75, "rewards/reward_combined/std": 1.5, "step": 2871 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.75, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 19.75, "completions/mean_terminated_length": 19.75, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 53.18518518518518, "frac_reward_zero_std": 0.0, "grad_norm": 5.06455135345459, "kl": 0.06018517538905144, "learning_rate": 3.960555555555556e-06, "loss": -0.0711, "num_tokens": 866400.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 2872 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 34.25, "completions/clipped_ratio": 0.0, "completions/max_length": 38.0, "completions/max_terminated_length": 38.0, "completions/mean_length": 34.25, "completions/mean_terminated_length": 34.25, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 53.2037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.06536868959665298, "kl": 0.03701540268957615, "learning_rate": 3.96e-06, "loss": 0.0019, "num_tokens": 866761.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2873 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 26.25, "completions/clipped_ratio": 0.0, "completions/max_length": 31.0, "completions/max_terminated_length": 31.0, "completions/mean_length": 26.25, "completions/mean_terminated_length": 26.25, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 53.22222222222222, "frac_reward_zero_std": 0.0, "grad_norm": 3.079016923904419, "kl": 0.1534525416791439, "learning_rate": 3.9594444444444446e-06, "loss": -0.1258, "num_tokens": 867094.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 2874 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 53.24074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.005993408616632223, "kl": 0.0004240224661771208, "learning_rate": 3.958888888888889e-06, "loss": 0.0, "num_tokens": 867362.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2875 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 53.25925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.3052841126918793, "kl": 0.03527927491813898, "learning_rate": 3.958333333333333e-06, "loss": 0.0018, "num_tokens": 867654.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2876 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 29.0, "completions/clipped_ratio": 0.0, "completions/max_length": 30.0, "completions/max_terminated_length": 30.0, "completions/mean_length": 29.0, "completions/mean_terminated_length": 29.0, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 53.27777777777778, "frac_reward_zero_std": 1.0, "grad_norm": 3.6045472621917725, "kl": 0.4586468003690243, "learning_rate": 3.9577777777777785e-06, "loss": 0.0222, "num_tokens": 868018.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 2877 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 53.2962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.5569596886634827, "kl": 0.07941199542256072, "learning_rate": 3.957222222222222e-06, "loss": 0.0046, "num_tokens": 868282.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 2878 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.5, "completions/clipped_ratio": 0.0, "completions/max_length": 38.0, "completions/max_terminated_length": 38.0, "completions/mean_length": 32.5, "completions/mean_terminated_length": 32.5, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 53.31481481481482, "frac_reward_zero_std": 0.0, "grad_norm": 4.113317012786865, "kl": 0.10582764819264412, "learning_rate": 3.956666666666667e-06, "loss": -0.0772, "num_tokens": 868640.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 2879 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 6.0, "completions/mean_terminated_length": 6.0, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 53.333333333333336, "frac_reward_zero_std": 1.0, "grad_norm": 0.06679980456829071, "kl": 0.003494058968499303, "learning_rate": 3.9561111111111115e-06, "loss": 0.0002, "num_tokens": 868872.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 2880 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 53.351851851851855, "frac_reward_zero_std": 1.0, "grad_norm": 0.08589337021112442, "kl": 0.016946363262832165, "learning_rate": 3.955555555555556e-06, "loss": 0.0009, "num_tokens": 869168.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2881 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 27.25, "completions/clipped_ratio": 0.0, "completions/max_length": 29.0, "completions/max_terminated_length": 29.0, "completions/mean_length": 27.25, "completions/mean_terminated_length": 27.25, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 53.370370370370374, "frac_reward_zero_std": 0.0, "grad_norm": 4.996239185333252, "kl": 0.2161361575126648, "learning_rate": 3.955e-06, "loss": -0.0231, "num_tokens": 869493.0, "reward": 5.75, "reward_std": 4.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 4.5, "step": 2882 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.25, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 9.25, "completions/mean_terminated_length": 9.25, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 53.388888888888886, "frac_reward_zero_std": 1.0, "grad_norm": 0.14598383009433746, "kl": 0.022077316418290138, "learning_rate": 3.954444444444445e-06, "loss": 0.0011, "num_tokens": 869754.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2883 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 43.0, "completions/clipped_ratio": 0.0, "completions/max_length": 44.0, "completions/max_terminated_length": 44.0, "completions/mean_length": 43.0, "completions/mean_terminated_length": 43.0, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "epoch": 53.407407407407405, "frac_reward_zero_std": 1.0, "grad_norm": 0.06833326816558838, "kl": 0.0556776262819767, "learning_rate": 3.953888888888889e-06, "loss": 0.0028, "num_tokens": 870206.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 2884 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.25, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 3.25, "completions/mean_terminated_length": 3.25, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 53.425925925925924, "frac_reward_zero_std": 1.0, "grad_norm": 0.10529830306768417, "kl": 0.01034916378557682, "learning_rate": 3.953333333333333e-06, "loss": 0.0007, "num_tokens": 870415.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 2885 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 53.44444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.002496433677151799, "kl": 0.0031832645181566477, "learning_rate": 3.9527777777777785e-06, "loss": 0.0002, "num_tokens": 870711.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2886 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 53.46296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.17432312667369843, "kl": 0.02112838253378868, "learning_rate": 3.952222222222222e-06, "loss": 0.0011, "num_tokens": 870955.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 2887 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 38.75, "completions/clipped_ratio": 0.0, "completions/max_length": 63.0, "completions/max_terminated_length": 63.0, "completions/mean_length": 38.75, "completions/mean_terminated_length": 38.75, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 53.48148148148148, "frac_reward_zero_std": 0.0, "grad_norm": 2.295614242553711, "kl": 0.11445041745901108, "learning_rate": 3.951666666666667e-06, "loss": 0.0859, "num_tokens": 871346.0, "reward": 4.375, "reward_std": 2.0966243743896484, "rewards/reward_combined/mean": 4.375, "rewards/reward_combined/std": 2.0966243743896484, "step": 2888 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 53.5, "frac_reward_zero_std": 1.0, "grad_norm": 0.016462057828903198, "kl": 0.025929272174835205, "learning_rate": 3.951111111111112e-06, "loss": 0.0013, "num_tokens": 871562.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 2889 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.75, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 13.75, "completions/mean_terminated_length": 13.75, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 53.51851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 3.7433645725250244, "kl": 0.6274641533382237, "learning_rate": 3.950555555555556e-06, "loss": 0.0277, "num_tokens": 871849.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2890 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 53.53703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.08580091595649719, "kl": 0.035081781446933746, "learning_rate": 3.95e-06, "loss": 0.0018, "num_tokens": 872123.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2891 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 27.25, "completions/clipped_ratio": 0.0, "completions/max_length": 49.0, "completions/max_terminated_length": 49.0, "completions/mean_length": 27.25, "completions/mean_terminated_length": 27.25, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 53.55555555555556, "frac_reward_zero_std": 0.0, "grad_norm": 2.3193113803863525, "kl": 0.03794063534587622, "learning_rate": 3.949444444444445e-06, "loss": -0.0318, "num_tokens": 872452.0, "reward": 4.875, "reward_std": 3.1983067989349365, "rewards/reward_combined/mean": 4.875, "rewards/reward_combined/std": 3.1983067989349365, "step": 2892 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 53.574074074074076, "frac_reward_zero_std": 1.0, "grad_norm": 0.006536336615681648, "kl": 0.0020538687240332365, "learning_rate": 3.948888888888889e-06, "loss": 0.0001, "num_tokens": 872734.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2893 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 53.592592592592595, "frac_reward_zero_std": 1.0, "grad_norm": 0.0056996033526957035, "kl": 0.00014308989193523303, "learning_rate": 3.948333333333333e-06, "loss": 0.0, "num_tokens": 872990.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 2894 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 53.611111111111114, "frac_reward_zero_std": 1.0, "grad_norm": 0.012194788083434105, "kl": 0.003575243055820465, "learning_rate": 3.9477777777777786e-06, "loss": 0.0002, "num_tokens": 873250.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 2895 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 53.629629629629626, "frac_reward_zero_std": 1.0, "grad_norm": 0.0001488080306444317, "kl": 5.0455331802368164e-05, "learning_rate": 3.947222222222222e-06, "loss": 0.0, "num_tokens": 873470.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 2896 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 53.648148148148145, "frac_reward_zero_std": 1.0, "grad_norm": 0.0016201743856072426, "kl": 1.7061829566955566e-05, "learning_rate": 3.946666666666667e-06, "loss": 0.0, "num_tokens": 873682.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 2897 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.0, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 24.0, "completions/mean_terminated_length": 24.0, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 53.666666666666664, "frac_reward_zero_std": 1.0, "grad_norm": 0.0028799138963222504, "kl": 0.09039357304573059, "learning_rate": 3.946111111111112e-06, "loss": 0.0045, "num_tokens": 874046.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 2898 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.0, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 20.0, "completions/mean_terminated_length": 20.0, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 53.68518518518518, "frac_reward_zero_std": 1.0, "grad_norm": 0.1278209090232849, "kl": 0.03504197299480438, "learning_rate": 3.945555555555556e-06, "loss": 0.0018, "num_tokens": 874346.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2899 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 53.7037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.10253849625587463, "kl": 0.013759092427790165, "learning_rate": 3.945e-06, "loss": 0.0007, "num_tokens": 874630.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2900 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.0, "completions/clipped_ratio": 0.0, "completions/max_length": 30.0, "completions/max_terminated_length": 30.0, "completions/mean_length": 24.0, "completions/mean_terminated_length": 24.0, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 53.72222222222222, "frac_reward_zero_std": 1.0, "grad_norm": 0.21811646223068237, "kl": 0.07323556579649448, "learning_rate": 3.944444444444445e-06, "loss": 0.0037, "num_tokens": 874978.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2901 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 53.74074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.011868991889059544, "kl": 0.0031711275223642588, "learning_rate": 3.943888888888889e-06, "loss": 0.0002, "num_tokens": 875262.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2902 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 26.5, "completions/clipped_ratio": 0.0, "completions/max_length": 48.0, "completions/max_terminated_length": 48.0, "completions/mean_length": 26.5, "completions/mean_terminated_length": 26.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 53.75925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.03533748909831047, "kl": 0.018739304039627314, "learning_rate": 3.943333333333333e-06, "loss": 0.0008, "num_tokens": 875588.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2903 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 53.77777777777778, "frac_reward_zero_std": 1.0, "grad_norm": 0.003941490780562162, "kl": 0.010595008730888367, "learning_rate": 3.942777777777778e-06, "loss": 0.0005, "num_tokens": 875824.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 2904 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.5, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 16.5, "completions/mean_terminated_length": 16.5, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 53.7962962962963, "frac_reward_zero_std": 0.0, "grad_norm": 3.8738608360290527, "kl": 0.24246317893266678, "learning_rate": 3.942222222222222e-06, "loss": 0.0425, "num_tokens": 876130.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 2905 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.5, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 10.5, "completions/mean_terminated_length": 10.5, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 53.81481481481482, "frac_reward_zero_std": 1.0, "grad_norm": 0.14452314376831055, "kl": 0.018854759633541107, "learning_rate": 3.941666666666667e-06, "loss": 0.0009, "num_tokens": 876396.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2906 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.0, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 19.0, "completions/mean_terminated_length": 19.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 53.833333333333336, "frac_reward_zero_std": 1.0, "grad_norm": 0.531822919845581, "kl": 0.04704783298075199, "learning_rate": 3.941111111111112e-06, "loss": 0.0022, "num_tokens": 876724.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2907 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.0, "completions/clipped_ratio": 0.0, "completions/max_length": 28.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 22.0, "completions/mean_terminated_length": 22.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 53.851851851851855, "frac_reward_zero_std": 0.0, "grad_norm": 2.9411041736602783, "kl": 0.0693642906844616, "learning_rate": 3.940555555555556e-06, "loss": -0.0794, "num_tokens": 877036.0, "reward": 7.625, "reward_std": 0.25, "rewards/reward_combined/mean": 7.625, "rewards/reward_combined/std": 0.25, "step": 2908 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.75, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.75, "completions/mean_terminated_length": 14.75, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 53.870370370370374, "frac_reward_zero_std": 1.0, "grad_norm": 0.006281669717282057, "kl": 0.0012271690065972507, "learning_rate": 3.94e-06, "loss": 0.0001, "num_tokens": 877355.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2909 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.75, "completions/clipped_ratio": 0.0, "completions/max_length": 29.0, "completions/max_terminated_length": 29.0, "completions/mean_length": 23.75, "completions/mean_terminated_length": 23.75, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 53.888888888888886, "frac_reward_zero_std": 1.0, "grad_norm": 0.04204998537898064, "kl": 0.009921287652105093, "learning_rate": 3.939444444444445e-06, "loss": 0.0005, "num_tokens": 877678.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2910 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 10.5, "completions/mean_terminated_length": 10.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 53.907407407407405, "frac_reward_zero_std": 1.0, "grad_norm": 0.03255746513605118, "kl": 0.00240558193763718, "learning_rate": 3.938888888888889e-06, "loss": 0.0001, "num_tokens": 877940.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2911 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 53.925925925925924, "frac_reward_zero_std": 1.0, "grad_norm": 0.038143694400787354, "kl": 0.0035125897265970707, "learning_rate": 3.9383333333333335e-06, "loss": 0.0002, "num_tokens": 878216.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2912 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 5.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 5.0, "completions/mean_terminated_length": 5.0, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 53.94444444444444, "frac_reward_zero_std": 0.0, "grad_norm": 4.6581926345825195, "kl": 0.023105616121029016, "learning_rate": 3.937777777777778e-06, "loss": 0.0036, "num_tokens": 878436.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 2913 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.0, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 21.0, "completions/mean_terminated_length": 21.0, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 53.96296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.04578632488846779, "kl": 0.011655998881906271, "learning_rate": 3.937222222222222e-06, "loss": 0.0006, "num_tokens": 878764.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2914 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.75, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 11.75, "completions/mean_terminated_length": 11.75, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 53.98148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.10539723187685013, "kl": 0.025798041373491287, "learning_rate": 3.936666666666667e-06, "loss": 0.0015, "num_tokens": 879031.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2915 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 26.25, "completions/clipped_ratio": 0.0, "completions/max_length": 70.0, "completions/max_terminated_length": 70.0, "completions/mean_length": 26.25, "completions/mean_terminated_length": 26.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 54.0, "frac_reward_zero_std": 0.0, "grad_norm": 3.076833963394165, "kl": 0.053358044475317, "learning_rate": 3.936111111111112e-06, "loss": 0.3654, "num_tokens": 879360.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 2916 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 54.01851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.13295413553714752, "kl": 0.047185229137539864, "learning_rate": 3.935555555555556e-06, "loss": 0.0024, "num_tokens": 879628.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2917 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 71.5, "completions/clipped_ratio": 0.25, "completions/max_length": 256.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 71.5, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 54.03703703703704, "frac_reward_zero_std": 0.0, "grad_norm": 2.123688220977783, "kl": 0.013536912389099598, "learning_rate": 3.9350000000000004e-06, "loss": 0.4626, "num_tokens": 880134.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 2918 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 28.5, "completions/clipped_ratio": 0.0, "completions/max_length": 50.0, "completions/max_terminated_length": 50.0, "completions/mean_length": 28.5, "completions/mean_terminated_length": 28.5, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 54.05555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.12168178707361221, "kl": 0.03980989754199982, "learning_rate": 3.934444444444445e-06, "loss": 0.002, "num_tokens": 880468.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2919 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 54.074074074074076, "frac_reward_zero_std": 1.0, "grad_norm": 0.029495282098650932, "kl": 0.000487402081489563, "learning_rate": 3.933888888888889e-06, "loss": 0.0, "num_tokens": 880680.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 2920 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.75, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 16.75, "completions/mean_terminated_length": 16.75, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 54.092592592592595, "frac_reward_zero_std": 0.0, "grad_norm": 11.454000473022461, "kl": 0.2116695549339056, "learning_rate": 3.9333333333333335e-06, "loss": 0.0611, "num_tokens": 880987.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 2921 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 54.111111111111114, "frac_reward_zero_std": 1.0, "grad_norm": 0.0001509602734586224, "kl": 5.065649747848511e-05, "learning_rate": 3.932777777777778e-06, "loss": 0.0, "num_tokens": 881207.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 2922 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 54.129629629629626, "frac_reward_zero_std": 1.0, "grad_norm": 0.03366344794631004, "kl": 0.004462434444576502, "learning_rate": 3.932222222222222e-06, "loss": 0.0002, "num_tokens": 881491.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2923 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.25, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 17.25, "completions/mean_terminated_length": 17.25, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 54.148148148148145, "frac_reward_zero_std": 1.0, "grad_norm": 0.01592978835105896, "kl": 0.0017031601164489985, "learning_rate": 3.9316666666666666e-06, "loss": 0.0001, "num_tokens": 881800.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2924 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.25, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 17.25, "completions/mean_terminated_length": 17.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 54.166666666666664, "frac_reward_zero_std": 1.0, "grad_norm": 0.018601762130856514, "kl": 0.17010696232318878, "learning_rate": 3.931111111111112e-06, "loss": 0.0085, "num_tokens": 882109.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2925 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.5, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 12.5, "completions/mean_terminated_length": 12.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 54.18518518518518, "frac_reward_zero_std": 1.0, "grad_norm": 0.43093422055244446, "kl": 0.03600211860612035, "learning_rate": 3.930555555555556e-06, "loss": 0.0015, "num_tokens": 882383.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2926 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.75, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 19.75, "completions/mean_terminated_length": 19.75, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 54.2037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.02156308852136135, "kl": 0.013441833667457104, "learning_rate": 3.9300000000000005e-06, "loss": 0.0007, "num_tokens": 882690.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2927 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.25, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 13.25, "completions/mean_terminated_length": 13.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 54.22222222222222, "frac_reward_zero_std": 1.0, "grad_norm": 0.05623588711023331, "kl": 0.0574048962444067, "learning_rate": 3.929444444444445e-06, "loss": 0.0028, "num_tokens": 882959.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2928 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 54.24074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.00134828791487962, "kl": 0.001854666625149548, "learning_rate": 3.928888888888889e-06, "loss": 0.0001, "num_tokens": 883239.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2929 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.25, "completions/clipped_ratio": 0.0, "completions/max_length": 38.0, "completions/max_terminated_length": 38.0, "completions/mean_length": 19.25, "completions/mean_terminated_length": 19.25, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 54.25925925925926, "frac_reward_zero_std": 0.0, "grad_norm": 4.0970587730407715, "kl": 0.04175542667508125, "learning_rate": 3.9283333333333336e-06, "loss": 0.2515, "num_tokens": 883536.0, "reward": 6.5, "reward_std": 2.0, "rewards/reward_combined/mean": 6.5, "rewards/reward_combined/std": 2.0, "step": 2930 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 54.27777777777778, "frac_reward_zero_std": 0.0, "grad_norm": 10.30145263671875, "kl": 0.04932282492518425, "learning_rate": 3.927777777777778e-06, "loss": 0.3446, "num_tokens": 883764.0, "reward": 2.75, "reward_std": 1.5, "rewards/reward_combined/mean": 2.75, "rewards/reward_combined/std": 1.5, "step": 2931 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 54.2962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.0037002423778176308, "kl": 0.010649144649505615, "learning_rate": 3.927222222222222e-06, "loss": 0.0005, "num_tokens": 884000.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 2932 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.5, "completions/clipped_ratio": 0.0, "completions/max_length": 33.0, "completions/max_terminated_length": 33.0, "completions/mean_length": 32.5, "completions/mean_terminated_length": 32.5, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 54.31481481481482, "frac_reward_zero_std": 1.0, "grad_norm": 0.06890930235385895, "kl": 0.03442916460335255, "learning_rate": 3.926666666666667e-06, "loss": 0.0017, "num_tokens": 884354.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2933 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 34.25, "completions/clipped_ratio": 0.0, "completions/max_length": 53.0, "completions/max_terminated_length": 53.0, "completions/mean_length": 34.25, "completions/mean_terminated_length": 34.25, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 54.333333333333336, "frac_reward_zero_std": 0.0, "grad_norm": 5.88813591003418, "kl": 0.17975660413503647, "learning_rate": 3.926111111111112e-06, "loss": 0.1024, "num_tokens": 884727.0, "reward": 3.875, "reward_std": 3.092329263687134, "rewards/reward_combined/mean": 3.875, "rewards/reward_combined/std": 3.092329263687134, "step": 2934 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 54.351851851851855, "frac_reward_zero_std": 1.0, "grad_norm": 0.0113774249330163, "kl": 0.0029114149510860443, "learning_rate": 3.925555555555556e-06, "loss": 0.0001, "num_tokens": 884971.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 2935 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 54.370370370370374, "frac_reward_zero_std": 1.0, "grad_norm": 0.023120718076825142, "kl": 0.0021905270405113697, "learning_rate": 3.9250000000000005e-06, "loss": 0.0001, "num_tokens": 885231.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2936 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 5.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 5.0, "completions/mean_terminated_length": 5.0, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 54.388888888888886, "frac_reward_zero_std": 1.0, "grad_norm": 0.003933645784854889, "kl": 0.00022301673743641004, "learning_rate": 3.924444444444445e-06, "loss": 0.0, "num_tokens": 885451.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 2937 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.5, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 23.5, "completions/mean_terminated_length": 23.5, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 54.407407407407405, "frac_reward_zero_std": 0.0, "grad_norm": 2.3808341026306152, "kl": 0.0388447642326355, "learning_rate": 3.923888888888889e-06, "loss": 0.0232, "num_tokens": 885773.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 2938 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 54.425925925925924, "frac_reward_zero_std": 1.0, "grad_norm": 0.018533479422330856, "kl": 0.001703758374787867, "learning_rate": 3.923333333333334e-06, "loss": 0.0001, "num_tokens": 886041.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2939 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.25, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 6.25, "completions/mean_terminated_length": 6.25, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 54.44444444444444, "frac_reward_zero_std": 0.0, "grad_norm": 5.683305263519287, "kl": 0.3697463534772396, "learning_rate": 3.922777777777778e-06, "loss": 0.2373, "num_tokens": 886266.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 2940 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 43.0, "completions/clipped_ratio": 0.0, "completions/max_length": 43.0, "completions/max_terminated_length": 43.0, "completions/mean_length": 43.0, "completions/mean_terminated_length": 43.0, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 54.46296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.01684139296412468, "kl": 0.058610834181308746, "learning_rate": 3.922222222222223e-06, "loss": 0.0029, "num_tokens": 886718.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 2941 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 54.48148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.05306917801499367, "kl": 0.012219988275319338, "learning_rate": 3.921666666666667e-06, "loss": 0.0006, "num_tokens": 887008.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2942 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.75, "completions/clipped_ratio": 0.0, "completions/max_length": 25.0, "completions/max_terminated_length": 25.0, "completions/mean_length": 20.75, "completions/mean_terminated_length": 20.75, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 54.5, "frac_reward_zero_std": 1.0, "grad_norm": 4.323760986328125, "kl": 0.4119283854961395, "learning_rate": 3.921111111111112e-06, "loss": 0.0192, "num_tokens": 887311.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 2943 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 7.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 7.0, "completions/mean_terminated_length": 7.0, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 54.51851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.006040235050022602, "kl": 0.00036914860538672656, "learning_rate": 3.920555555555555e-06, "loss": 0.0, "num_tokens": 887547.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 2944 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.0, "completions/clipped_ratio": 0.0, "completions/max_length": 27.0, "completions/max_terminated_length": 27.0, "completions/mean_length": 23.0, "completions/mean_terminated_length": 23.0, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 54.53703703703704, "frac_reward_zero_std": 0.0, "grad_norm": 2.6762001514434814, "kl": 0.05446097254753113, "learning_rate": 3.920000000000001e-06, "loss": 0.0205, "num_tokens": 887859.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 2945 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 28.5, "completions/clipped_ratio": 0.0, "completions/max_length": 37.0, "completions/max_terminated_length": 37.0, "completions/mean_length": 28.5, "completions/mean_terminated_length": 28.5, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 54.55555555555556, "frac_reward_zero_std": 0.0, "grad_norm": 1.9644999504089355, "kl": 0.14624780789017677, "learning_rate": 3.919444444444445e-06, "loss": 0.0474, "num_tokens": 888201.0, "reward": 5.125, "reward_std": 3.4731109142303467, "rewards/reward_combined/mean": 5.125, "rewards/reward_combined/std": 3.4731109142303467, "step": 2946 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 54.574074074074076, "frac_reward_zero_std": 1.0, "grad_norm": 0.0030961656011641026, "kl": 0.0012062887544743717, "learning_rate": 3.918888888888889e-06, "loss": 0.0001, "num_tokens": 888521.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2947 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 54.592592592592595, "frac_reward_zero_std": 1.0, "grad_norm": 0.04458121582865715, "kl": 0.011082459706813097, "learning_rate": 3.918333333333334e-06, "loss": 0.0006, "num_tokens": 888793.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2948 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 54.611111111111114, "frac_reward_zero_std": 1.0, "grad_norm": 0.007351330015808344, "kl": 0.0005617588758468628, "learning_rate": 3.917777777777778e-06, "loss": 0.0, "num_tokens": 889005.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 2949 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.5, "completions/clipped_ratio": 0.0, "completions/max_length": 28.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 23.5, "completions/mean_terminated_length": 23.5, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 54.629629629629626, "frac_reward_zero_std": 1.0, "grad_norm": 0.1325899362564087, "kl": 0.04281844198703766, "learning_rate": 3.917222222222223e-06, "loss": 0.0022, "num_tokens": 889323.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2950 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 54.648148148148145, "frac_reward_zero_std": 0.0, "grad_norm": 3.910425901412964, "kl": 0.10100420750677586, "learning_rate": 3.916666666666667e-06, "loss": -0.0907, "num_tokens": 889607.0, "reward": 7.875, "reward_std": 0.25, "rewards/reward_combined/mean": 7.875, "rewards/reward_combined/std": 0.25, "step": 2951 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 54.666666666666664, "frac_reward_zero_std": 1.0, "grad_norm": 0.02294129692018032, "kl": 0.005689134355634451, "learning_rate": 3.916111111111112e-06, "loss": 0.0003, "num_tokens": 889905.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2952 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.25, "completions/clipped_ratio": 0.0, "completions/max_length": 28.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 21.25, "completions/mean_terminated_length": 21.25, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 54.68518518518518, "frac_reward_zero_std": 1.0, "grad_norm": 0.1543031632900238, "kl": 0.06630479544401169, "learning_rate": 3.9155555555555554e-06, "loss": 0.0033, "num_tokens": 890242.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2953 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.75, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 13.75, "completions/mean_terminated_length": 13.75, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 54.7037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.09015223383903503, "kl": 0.0231318362057209, "learning_rate": 3.915000000000001e-06, "loss": 0.0012, "num_tokens": 890529.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2954 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 54.72222222222222, "frac_reward_zero_std": 1.0, "grad_norm": 0.01396949216723442, "kl": 0.00024338364164577797, "learning_rate": 3.914444444444445e-06, "loss": 0.0, "num_tokens": 890785.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 2955 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.0, "completions/clipped_ratio": 0.0, "completions/max_length": 30.0, "completions/max_terminated_length": 30.0, "completions/mean_length": 24.0, "completions/mean_terminated_length": 24.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 54.74074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.3535214364528656, "kl": 0.05132490023970604, "learning_rate": 3.913888888888889e-06, "loss": 0.0026, "num_tokens": 891129.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 2956 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.0, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 21.0, "completions/mean_terminated_length": 21.0, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 54.75925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.05438661947846413, "kl": 0.01112218271009624, "learning_rate": 3.913333333333334e-06, "loss": 0.0006, "num_tokens": 891465.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2957 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 54.77777777777778, "frac_reward_zero_std": 1.0, "grad_norm": 0.007121377624571323, "kl": 0.016176234930753708, "learning_rate": 3.912777777777778e-06, "loss": 0.0008, "num_tokens": 891777.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 2958 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 10.5, "completions/mean_terminated_length": 10.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 54.7962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.05116831138730049, "kl": 0.0016901682247407734, "learning_rate": 3.912222222222222e-06, "loss": 0.0001, "num_tokens": 892043.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2959 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 54.81481481481482, "frac_reward_zero_std": 1.0, "grad_norm": 0.10477099567651749, "kl": 0.26566174626350403, "learning_rate": 3.911666666666667e-06, "loss": 0.0133, "num_tokens": 892341.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2960 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.25, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 21.25, "completions/mean_terminated_length": 21.25, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 54.833333333333336, "frac_reward_zero_std": 1.0, "grad_norm": 3.6946282386779785, "kl": 0.6654970310628414, "learning_rate": 3.911111111111112e-06, "loss": 0.034, "num_tokens": 892674.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2961 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.5, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 8.5, "completions/mean_terminated_length": 8.5, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 54.851851851851855, "frac_reward_zero_std": 1.0, "grad_norm": 0.1284608244895935, "kl": 0.010985451750457287, "learning_rate": 3.9105555555555555e-06, "loss": 0.0006, "num_tokens": 892928.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2962 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.0, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 24.0, "completions/mean_terminated_length": 24.0, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 54.870370370370374, "frac_reward_zero_std": 1.0, "grad_norm": 0.002515350701287389, "kl": 0.0904296264052391, "learning_rate": 3.910000000000001e-06, "loss": 0.0045, "num_tokens": 893292.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 2963 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 54.888888888888886, "frac_reward_zero_std": 1.0, "grad_norm": 0.0943041667342186, "kl": 0.012462898739613593, "learning_rate": 3.909444444444444e-06, "loss": 0.0007, "num_tokens": 893556.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 2964 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0078125, "clip_ratio/region_mean": 0.0078125, "completion_length": 30.0, "completions/clipped_ratio": 0.0, "completions/max_length": 36.0, "completions/max_terminated_length": 36.0, "completions/mean_length": 30.0, "completions/mean_terminated_length": 30.0, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 54.907407407407405, "frac_reward_zero_std": 0.0, "grad_norm": 3.217656135559082, "kl": 0.17843125015497208, "learning_rate": 3.908888888888889e-06, "loss": 0.0712, "num_tokens": 893892.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 2965 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 34.25, "completions/clipped_ratio": 0.0, "completions/max_length": 35.0, "completions/max_terminated_length": 35.0, "completions/mean_length": 34.25, "completions/mean_terminated_length": 34.25, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 54.925925925925924, "frac_reward_zero_std": 1.0, "grad_norm": 0.024715004488825798, "kl": 0.08214797452092171, "learning_rate": 3.908333333333334e-06, "loss": 0.0041, "num_tokens": 894257.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2966 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.5, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 20.5, "completions/mean_terminated_length": 20.5, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 54.94444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.04459110274910927, "kl": 0.011200563050806522, "learning_rate": 3.907777777777778e-06, "loss": 0.0006, "num_tokens": 894583.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2967 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 54.96296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.007651137188076973, "kl": 0.01706084329634905, "learning_rate": 3.9072222222222225e-06, "loss": 0.0009, "num_tokens": 894843.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2968 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 54.98148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.04180046543478966, "kl": 0.009601094294339418, "learning_rate": 3.906666666666667e-06, "loss": 0.0005, "num_tokens": 895131.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2969 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 55.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.05773744359612465, "kl": 0.0052864812314510345, "learning_rate": 3.906111111111112e-06, "loss": 0.0003, "num_tokens": 895391.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 2970 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 37.25, "completions/clipped_ratio": 0.0, "completions/max_length": 49.0, "completions/max_terminated_length": 49.0, "completions/mean_length": 37.25, "completions/mean_terminated_length": 37.25, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 55.01851851851852, "frac_reward_zero_std": 0.0, "grad_norm": 3.5744805335998535, "kl": 0.13946779817342758, "learning_rate": 3.9055555555555555e-06, "loss": 0.1443, "num_tokens": 895776.0, "reward": 2.875, "reward_std": 1.25, "rewards/reward_combined/mean": 2.875, "rewards/reward_combined/std": 1.25, "step": 2971 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.5, "completions/clipped_ratio": 0.0, "completions/max_length": 28.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 22.5, "completions/mean_terminated_length": 22.5, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 55.03703703703704, "frac_reward_zero_std": 0.0, "grad_norm": 3.015075206756592, "kl": 0.07694989815354347, "learning_rate": 3.905000000000001e-06, "loss": -0.0312, "num_tokens": 896090.0, "reward": 7.625, "reward_std": 0.25, "rewards/reward_combined/mean": 7.625, "rewards/reward_combined/std": 0.25, "step": 2972 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 28.0, "completions/clipped_ratio": 0.0, "completions/max_length": 29.0, "completions/max_terminated_length": 29.0, "completions/mean_length": 28.0, "completions/mean_terminated_length": 28.0, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 55.05555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.04710201919078827, "kl": 0.04806765541434288, "learning_rate": 3.904444444444444e-06, "loss": 0.0024, "num_tokens": 896450.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 2973 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 40.25, "completions/clipped_ratio": 0.0, "completions/max_length": 54.0, "completions/max_terminated_length": 54.0, "completions/mean_length": 40.25, "completions/mean_terminated_length": 40.25, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 55.074074074074076, "frac_reward_zero_std": 0.0, "grad_norm": 4.484079360961914, "kl": 1.0156061984598637, "learning_rate": 3.9038888888888894e-06, "loss": -0.0898, "num_tokens": 896831.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 2974 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 55.092592592592595, "frac_reward_zero_std": 1.0, "grad_norm": 0.05592005327343941, "kl": 0.010979237966239452, "learning_rate": 3.903333333333334e-06, "loss": 0.0005, "num_tokens": 897103.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2975 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 39.0, "completions/clipped_ratio": 0.0, "completions/max_length": 46.0, "completions/max_terminated_length": 46.0, "completions/mean_length": 39.0, "completions/mean_terminated_length": 39.0, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 55.111111111111114, "frac_reward_zero_std": 1.0, "grad_norm": 0.36799609661102295, "kl": 0.10151886194944382, "learning_rate": 3.902777777777778e-06, "loss": 0.0052, "num_tokens": 897483.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2976 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 55.129629629629626, "frac_reward_zero_std": 1.0, "grad_norm": 0.0001341558963758871, "kl": 5.589425563812256e-05, "learning_rate": 3.9022222222222225e-06, "loss": 0.0, "num_tokens": 897703.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 2977 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 55.148148148148145, "frac_reward_zero_std": 1.0, "grad_norm": 0.015517124906182289, "kl": 0.002069992362521589, "learning_rate": 3.901666666666667e-06, "loss": 0.0001, "num_tokens": 897975.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2978 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.75, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.75, "completions/mean_terminated_length": 6.75, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 55.166666666666664, "frac_reward_zero_std": 1.0, "grad_norm": 0.011681023985147476, "kl": 0.0006228422862477601, "learning_rate": 3.901111111111111e-06, "loss": 0.0, "num_tokens": 898210.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 2979 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.25, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.25, "completions/mean_terminated_length": 15.25, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 55.18518518518518, "frac_reward_zero_std": 1.0, "grad_norm": 0.013627220876514912, "kl": 0.0022198037477210164, "learning_rate": 3.9005555555555556e-06, "loss": 0.0001, "num_tokens": 898531.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2980 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.75, "completions/clipped_ratio": 0.0, "completions/max_length": 27.0, "completions/max_terminated_length": 27.0, "completions/mean_length": 21.75, "completions/mean_terminated_length": 21.75, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 55.2037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.10667815059423447, "kl": 0.02333084214478731, "learning_rate": 3.900000000000001e-06, "loss": 0.0011, "num_tokens": 898838.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2981 }, { "clip_ratio/high_max": 0.011363636702299118, "clip_ratio/high_mean": 0.011363636702299118, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.011363636702299118, "completion_length": 24.75, "completions/clipped_ratio": 0.0, "completions/max_length": 31.0, "completions/max_terminated_length": 31.0, "completions/mean_length": 24.75, "completions/mean_terminated_length": 24.75, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 55.22222222222222, "frac_reward_zero_std": 0.0, "grad_norm": 2.4844229221343994, "kl": 0.01708794431760907, "learning_rate": 3.899444444444444e-06, "loss": 0.0597, "num_tokens": 899165.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 2982 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 55.24074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.038148749619722366, "kl": 0.006309555727057159, "learning_rate": 3.8988888888888895e-06, "loss": 0.0003, "num_tokens": 899453.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2983 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 55.25925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.07131102681159973, "kl": 0.014914346858859062, "learning_rate": 3.898333333333333e-06, "loss": 0.0007, "num_tokens": 899751.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2984 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 31.0, "completions/clipped_ratio": 0.0, "completions/max_length": 34.0, "completions/max_terminated_length": 34.0, "completions/mean_length": 31.0, "completions/mean_terminated_length": 31.0, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 55.27777777777778, "frac_reward_zero_std": 0.0, "grad_norm": 2.697399139404297, "kl": 0.19623763859272003, "learning_rate": 3.897777777777778e-06, "loss": 0.0132, "num_tokens": 900091.0, "reward": 5.75, "reward_std": 2.598076105117798, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 2.598076105117798, "step": 2985 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 55.2962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.03902651369571686, "kl": 0.01740108523517847, "learning_rate": 3.8972222222222226e-06, "loss": 0.0009, "num_tokens": 900367.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2986 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.75, "completions/clipped_ratio": 0.0, "completions/max_length": 28.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 22.75, "completions/mean_terminated_length": 22.75, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 55.31481481481482, "frac_reward_zero_std": 1.0, "grad_norm": 0.14646245539188385, "kl": 0.0536044966429472, "learning_rate": 3.896666666666667e-06, "loss": 0.0026, "num_tokens": 900710.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2987 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 55.333333333333336, "frac_reward_zero_std": 1.0, "grad_norm": 0.011171757243573666, "kl": 0.0018432681681588292, "learning_rate": 3.896111111111111e-06, "loss": 0.0001, "num_tokens": 901006.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2988 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.0, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 21.0, "completions/mean_terminated_length": 21.0, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 55.351851851851855, "frac_reward_zero_std": 1.0, "grad_norm": 0.04360317066311836, "kl": 0.028591222129762173, "learning_rate": 3.895555555555556e-06, "loss": 0.0014, "num_tokens": 901338.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2989 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 55.370370370370374, "frac_reward_zero_std": 1.0, "grad_norm": 0.004236945882439613, "kl": 0.0012848198530264199, "learning_rate": 3.895000000000001e-06, "loss": 0.0001, "num_tokens": 901598.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 2990 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 41.0, "completions/clipped_ratio": 0.0, "completions/max_length": 43.0, "completions/max_terminated_length": 43.0, "completions/mean_length": 41.0, "completions/mean_terminated_length": 41.0, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 55.388888888888886, "frac_reward_zero_std": 1.0, "grad_norm": 0.06962081044912338, "kl": 0.07316041179001331, "learning_rate": 3.894444444444444e-06, "loss": 0.0036, "num_tokens": 902042.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 2991 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 55.407407407407405, "frac_reward_zero_std": 1.0, "grad_norm": 0.028188731521368027, "kl": 0.004186376929283142, "learning_rate": 3.8938888888888895e-06, "loss": 0.0002, "num_tokens": 902302.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 2992 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 55.425925925925924, "frac_reward_zero_std": 1.0, "grad_norm": 0.010125330649316311, "kl": 0.0020540431141853333, "learning_rate": 3.893333333333333e-06, "loss": 0.0001, "num_tokens": 902546.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 2993 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 55.44444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.007582643069326878, "kl": 0.01596122607588768, "learning_rate": 3.892777777777778e-06, "loss": 0.0008, "num_tokens": 902858.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 2994 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 55.46296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.06368894875049591, "kl": 0.024840449914336205, "learning_rate": 3.892222222222223e-06, "loss": 0.0012, "num_tokens": 903142.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 2995 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.5, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 17.5, "completions/mean_terminated_length": 17.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 55.48148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.05495568737387657, "kl": 0.16936735808849335, "learning_rate": 3.891666666666667e-06, "loss": 0.0085, "num_tokens": 903452.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2996 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.25, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 15.25, "completions/mean_terminated_length": 15.25, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 55.5, "frac_reward_zero_std": 0.0, "grad_norm": 7.555488586425781, "kl": 0.24601907283067703, "learning_rate": 3.891111111111111e-06, "loss": 0.0277, "num_tokens": 903733.0, "reward": 5.875, "reward_std": 3.25, "rewards/reward_combined/mean": 5.875, "rewards/reward_combined/std": 3.25, "step": 2997 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 55.51851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.017251459881663322, "kl": 0.0037591943982988596, "learning_rate": 3.890555555555556e-06, "loss": 0.0002, "num_tokens": 904001.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2998 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 55.53703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.06784974783658981, "kl": 0.009894698392599821, "learning_rate": 3.89e-06, "loss": 0.0004, "num_tokens": 904285.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 2999 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.5, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 19.5, "completions/mean_terminated_length": 19.5, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 55.55555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.02368985116481781, "kl": 0.013432743959128857, "learning_rate": 3.889444444444444e-06, "loss": 0.0007, "num_tokens": 904591.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3000 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 55.574074074074076, "frac_reward_zero_std": 1.0, "grad_norm": 0.06957027316093445, "kl": 0.005784003529697657, "learning_rate": 3.88888888888889e-06, "loss": 0.0003, "num_tokens": 904851.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3001 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 55.592592592592595, "frac_reward_zero_std": 1.0, "grad_norm": 0.00748987914994359, "kl": 0.017080138437449932, "learning_rate": 3.888333333333333e-06, "loss": 0.0009, "num_tokens": 905111.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3002 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.25, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.25, "completions/mean_terminated_length": 13.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 55.611111111111114, "frac_reward_zero_std": 1.0, "grad_norm": 0.001089235651306808, "kl": 0.0020439354120753706, "learning_rate": 3.887777777777778e-06, "loss": 0.0001, "num_tokens": 905388.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3003 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.0, "completions/mean_terminated_length": 6.0, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 55.629629629629626, "frac_reward_zero_std": 1.0, "grad_norm": 0.09189894050359726, "kl": 0.006672309013083577, "learning_rate": 3.887222222222223e-06, "loss": 0.0004, "num_tokens": 905612.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 3004 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 55.648148148148145, "frac_reward_zero_std": 1.0, "grad_norm": 0.005576233379542828, "kl": 0.03148340433835983, "learning_rate": 3.886666666666667e-06, "loss": 0.0016, "num_tokens": 905880.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3005 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 55.666666666666664, "frac_reward_zero_std": 1.0, "grad_norm": 0.11248769611120224, "kl": 0.057430244982242584, "learning_rate": 3.886111111111111e-06, "loss": 0.0032, "num_tokens": 906146.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3006 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 35.75, "completions/clipped_ratio": 0.0, "completions/max_length": 37.0, "completions/max_terminated_length": 37.0, "completions/mean_length": 35.75, "completions/mean_terminated_length": 35.75, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 55.68518518518518, "frac_reward_zero_std": 1.0, "grad_norm": 0.0883115753531456, "kl": 0.09654708206653595, "learning_rate": 3.885555555555556e-06, "loss": 0.0048, "num_tokens": 906517.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3007 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.25, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 4.25, "completions/mean_terminated_length": 4.25, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 55.7037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.04770127683877945, "kl": 0.0011669412488117814, "learning_rate": 3.885e-06, "loss": 0.0001, "num_tokens": 906730.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 3008 }, { "clip_ratio/high_max": 0.013513513840734959, "clip_ratio/high_mean": 0.013513513840734959, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.013513513840734959, "completion_length": 23.75, "completions/clipped_ratio": 0.0, "completions/max_length": 40.0, "completions/max_terminated_length": 40.0, "completions/mean_length": 23.75, "completions/mean_terminated_length": 23.75, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 55.72222222222222, "frac_reward_zero_std": 0.0, "grad_norm": 5.039653778076172, "kl": 0.07505093514919281, "learning_rate": 3.8844444444444444e-06, "loss": 0.2707, "num_tokens": 907045.0, "reward": 2.5, "reward_std": 1.7320507764816284, "rewards/reward_combined/mean": 2.5, "rewards/reward_combined/std": 1.7320507764816284, "step": 3009 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 55.74074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.005495330784469843, "kl": 0.003018706920556724, "learning_rate": 3.88388888888889e-06, "loss": 0.0002, "num_tokens": 907329.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3010 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.0, "completions/clipped_ratio": 0.0, "completions/max_length": 38.0, "completions/max_terminated_length": 38.0, "completions/mean_length": 25.0, "completions/mean_terminated_length": 25.0, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 55.75925925925926, "frac_reward_zero_std": 0.0, "grad_norm": 3.7567954063415527, "kl": 0.02058031503111124, "learning_rate": 3.883333333333333e-06, "loss": 0.1569, "num_tokens": 907681.0, "reward": 6.125, "reward_std": 3.75, "rewards/reward_combined/mean": 6.125, "rewards/reward_combined/std": 3.75, "step": 3011 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.25, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 19.25, "completions/mean_terminated_length": 19.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 55.77777777777778, "frac_reward_zero_std": 1.0, "grad_norm": 0.09056897461414337, "kl": 0.01312174554914236, "learning_rate": 3.882777777777778e-06, "loss": 0.0007, "num_tokens": 908002.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3012 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 55.7962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.004796093329787254, "kl": 0.010474525392055511, "learning_rate": 3.882222222222223e-06, "loss": 0.0005, "num_tokens": 908238.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 3013 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.75, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 6.75, "completions/mean_terminated_length": 6.75, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 55.81481481481482, "frac_reward_zero_std": 1.0, "grad_norm": 0.08455026894807816, "kl": 0.0278202835470438, "learning_rate": 3.881666666666667e-06, "loss": 0.0014, "num_tokens": 908465.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 3014 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.0, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 24.0, "completions/mean_terminated_length": 24.0, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 55.833333333333336, "frac_reward_zero_std": 1.0, "grad_norm": 0.003995392471551895, "kl": 0.08997517079114914, "learning_rate": 3.881111111111111e-06, "loss": 0.0045, "num_tokens": 908829.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 3015 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.25, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 17.25, "completions/mean_terminated_length": 17.25, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 55.851851851851855, "frac_reward_zero_std": 1.0, "grad_norm": 0.01968640647828579, "kl": 0.0029832004802301526, "learning_rate": 3.880555555555556e-06, "loss": 0.0001, "num_tokens": 909138.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3016 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 31.0, "completions/clipped_ratio": 0.0, "completions/max_length": 45.0, "completions/max_terminated_length": 45.0, "completions/mean_length": 31.0, "completions/mean_terminated_length": 31.0, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 55.870370370370374, "frac_reward_zero_std": 0.0, "grad_norm": 3.384733200073242, "kl": 0.11250351369380951, "learning_rate": 3.88e-06, "loss": 0.1449, "num_tokens": 909490.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 3017 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 55.888888888888886, "frac_reward_zero_std": 1.0, "grad_norm": 0.014647174626588821, "kl": 0.2399420365691185, "learning_rate": 3.8794444444444445e-06, "loss": 0.012, "num_tokens": 909790.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3018 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 55.907407407407405, "frac_reward_zero_std": 1.0, "grad_norm": 0.00540969567373395, "kl": 0.00020163953013252467, "learning_rate": 3.87888888888889e-06, "loss": 0.0, "num_tokens": 910046.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 3019 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 55.925925925925924, "frac_reward_zero_std": 1.0, "grad_norm": 0.14327608048915863, "kl": 0.01824752241373062, "learning_rate": 3.878333333333333e-06, "loss": 0.0008, "num_tokens": 910318.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3020 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.5, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 3.5, "completions/mean_terminated_length": 3.5, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 55.94444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.21170519292354584, "kl": 0.007326267659664154, "learning_rate": 3.877777777777778e-06, "loss": 0.0005, "num_tokens": 910528.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 3021 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 55.96296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.007356606423854828, "kl": 0.0007207244634628296, "learning_rate": 3.877222222222223e-06, "loss": 0.0, "num_tokens": 910740.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 3022 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 55.98148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.0481148324906826, "kl": 0.0038022411754354835, "learning_rate": 3.876666666666667e-06, "loss": 0.0002, "num_tokens": 911004.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3023 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.25, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 12.25, "completions/mean_terminated_length": 12.25, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 56.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.1367248147726059, "kl": 0.04435145575553179, "learning_rate": 3.8761111111111115e-06, "loss": 0.0023, "num_tokens": 911269.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3024 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 56.01851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.004584771115332842, "kl": 0.010506555438041687, "learning_rate": 3.875555555555556e-06, "loss": 0.0005, "num_tokens": 911505.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 3025 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.25, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 14.25, "completions/mean_terminated_length": 14.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 56.03703703703704, "frac_reward_zero_std": 0.0, "grad_norm": 3.055176019668579, "kl": 0.05478322505950928, "learning_rate": 3.875e-06, "loss": -0.0711, "num_tokens": 911794.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 3026 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 56.05555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 2.3425745964050293, "kl": 0.20671368390321732, "learning_rate": 3.8744444444444445e-06, "loss": 0.0103, "num_tokens": 912062.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3027 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.25, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 9.25, "completions/mean_terminated_length": 9.25, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 56.074074074074076, "frac_reward_zero_std": 1.0, "grad_norm": 0.07365432381629944, "kl": 0.015656953677535057, "learning_rate": 3.873888888888889e-06, "loss": 0.0008, "num_tokens": 912323.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3028 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.75, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.75, "completions/mean_terminated_length": 6.75, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 56.092592592592595, "frac_reward_zero_std": 1.0, "grad_norm": 0.012073429301381111, "kl": 0.0007246931490954012, "learning_rate": 3.873333333333333e-06, "loss": 0.0, "num_tokens": 912558.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 3029 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 56.111111111111114, "frac_reward_zero_std": 1.0, "grad_norm": 0.17007306218147278, "kl": 0.03656391240656376, "learning_rate": 3.8727777777777784e-06, "loss": 0.0019, "num_tokens": 912844.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3030 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 56.129629629629626, "frac_reward_zero_std": 1.0, "grad_norm": 0.016585268080234528, "kl": 0.22712577879428864, "learning_rate": 3.872222222222223e-06, "loss": 0.0113, "num_tokens": 913146.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3031 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.75, "completions/clipped_ratio": 0.0, "completions/max_length": 32.0, "completions/max_terminated_length": 32.0, "completions/mean_length": 23.75, "completions/mean_terminated_length": 23.75, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 56.148148148148145, "frac_reward_zero_std": 1.0, "grad_norm": 0.20655399560928345, "kl": 0.025229910388588905, "learning_rate": 3.871666666666667e-06, "loss": 0.0012, "num_tokens": 913461.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3032 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 56.166666666666664, "frac_reward_zero_std": 1.0, "grad_norm": 0.0429355651140213, "kl": 0.01513968501240015, "learning_rate": 3.8711111111111115e-06, "loss": 0.0008, "num_tokens": 913737.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3033 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 56.18518518518518, "frac_reward_zero_std": 1.0, "grad_norm": 0.0914241299033165, "kl": 0.015846097376197577, "learning_rate": 3.870555555555556e-06, "loss": 0.0008, "num_tokens": 914009.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3034 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.0, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 19.0, "completions/mean_terminated_length": 19.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 56.2037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.04734130576252937, "kl": 0.029023083858191967, "learning_rate": 3.87e-06, "loss": 0.0014, "num_tokens": 914337.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3035 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.75, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 15.75, "completions/mean_terminated_length": 15.75, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 56.22222222222222, "frac_reward_zero_std": 1.0, "grad_norm": 0.04199639707803726, "kl": 0.008081597508862615, "learning_rate": 3.8694444444444446e-06, "loss": 0.0005, "num_tokens": 914660.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3036 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.25, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 20.25, "completions/mean_terminated_length": 20.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 56.24074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.31345134973526, "kl": 0.062415961176157, "learning_rate": 3.868888888888889e-06, "loss": 0.0032, "num_tokens": 914989.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3037 }, { "clip_ratio/high_max": 0.011904762126505375, "clip_ratio/high_mean": 0.011904762126505375, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.011904762126505375, "completion_length": 26.75, "completions/clipped_ratio": 0.0, "completions/max_length": 48.0, "completions/max_terminated_length": 48.0, "completions/mean_length": 26.75, "completions/mean_terminated_length": 26.75, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 56.25925925925926, "frac_reward_zero_std": 0.0, "grad_norm": 4.173439025878906, "kl": 0.24263654509559274, "learning_rate": 3.868333333333333e-06, "loss": 0.0578, "num_tokens": 915316.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 3038 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 56.27777777777778, "frac_reward_zero_std": 1.0, "grad_norm": 0.01041572168469429, "kl": 0.0011517132297740318, "learning_rate": 3.8677777777777785e-06, "loss": 0.0001, "num_tokens": 915588.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3039 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 56.2962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.03024487942457199, "kl": 0.027227291837334633, "learning_rate": 3.867222222222223e-06, "loss": 0.0014, "num_tokens": 915856.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3040 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 56.31481481481482, "frac_reward_zero_std": 1.0, "grad_norm": 0.09514685720205307, "kl": 0.007777459919452667, "learning_rate": 3.866666666666667e-06, "loss": 0.0004, "num_tokens": 916064.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 3041 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.75, "completions/clipped_ratio": 0.0, "completions/max_length": 42.0, "completions/max_terminated_length": 42.0, "completions/mean_length": 24.75, "completions/mean_terminated_length": 24.75, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 56.333333333333336, "frac_reward_zero_std": 0.0, "grad_norm": 3.8006458282470703, "kl": 0.03711853176355362, "learning_rate": 3.8661111111111116e-06, "loss": 0.1896, "num_tokens": 916415.0, "reward": 5.875, "reward_std": 3.25, "rewards/reward_combined/mean": 5.875, "rewards/reward_combined/std": 3.25, "step": 3042 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 56.351851851851855, "frac_reward_zero_std": 1.0, "grad_norm": 0.009008049964904785, "kl": 0.015521196648478508, "learning_rate": 3.865555555555556e-06, "loss": 0.0008, "num_tokens": 916727.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 3043 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 56.370370370370374, "frac_reward_zero_std": 1.0, "grad_norm": 0.05179315432906151, "kl": 0.004793073982000351, "learning_rate": 3.865e-06, "loss": 0.0002, "num_tokens": 916979.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3044 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.25, "completions/clipped_ratio": 0.0, "completions/max_length": 27.0, "completions/max_terminated_length": 27.0, "completions/mean_length": 24.25, "completions/mean_terminated_length": 24.25, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 56.388888888888886, "frac_reward_zero_std": 0.0, "grad_norm": 2.8798422813415527, "kl": 0.1585402935743332, "learning_rate": 3.864444444444445e-06, "loss": -0.0429, "num_tokens": 917300.0, "reward": 6.75, "reward_std": 2.1794495582580566, "rewards/reward_combined/mean": 6.75, "rewards/reward_combined/std": 2.1794495582580566, "step": 3045 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 27.75, "completions/clipped_ratio": 0.0, "completions/max_length": 31.0, "completions/max_terminated_length": 31.0, "completions/mean_length": 27.75, "completions/mean_terminated_length": 27.75, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 56.407407407407405, "frac_reward_zero_std": 1.0, "grad_norm": 0.2922925651073456, "kl": 0.11648717522621155, "learning_rate": 3.863888888888889e-06, "loss": 0.0057, "num_tokens": 917639.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3046 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 56.425925925925924, "frac_reward_zero_std": 1.0, "grad_norm": 0.005681642796844244, "kl": 0.0012512803077697754, "learning_rate": 3.863333333333333e-06, "loss": 0.0001, "num_tokens": 917899.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 3047 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 56.44444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.09901385009288788, "kl": 0.01380264526233077, "learning_rate": 3.862777777777778e-06, "loss": 0.0007, "num_tokens": 918197.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3048 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 56.46296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.04628486558794975, "kl": 0.004523653537034988, "learning_rate": 3.862222222222223e-06, "loss": 0.0002, "num_tokens": 918457.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 3049 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 56.48148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.2643211781978607, "kl": 0.049426451325416565, "learning_rate": 3.861666666666667e-06, "loss": 0.0026, "num_tokens": 918721.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3050 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.009803921915590763, "clip_ratio/low_min": 0.009803921915590763, "clip_ratio/region_mean": 0.009803921915590763, "completion_length": 20.75, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 20.75, "completions/mean_terminated_length": 20.75, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 56.5, "frac_reward_zero_std": 0.0, "grad_norm": 4.659407615661621, "kl": 0.07365174032747746, "learning_rate": 3.861111111111112e-06, "loss": -0.1132, "num_tokens": 919024.0, "reward": 2.5, "reward_std": 1.7320507764816284, "rewards/reward_combined/mean": 2.5, "rewards/reward_combined/std": 1.7320507764816284, "step": 3051 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 42.0, "completions/clipped_ratio": 0.0, "completions/max_length": 45.0, "completions/max_terminated_length": 45.0, "completions/mean_length": 42.0, "completions/mean_terminated_length": 42.0, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 56.51851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.12664934992790222, "kl": 0.08011163398623466, "learning_rate": 3.860555555555556e-06, "loss": 0.004, "num_tokens": 919472.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 3052 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.5, "completions/clipped_ratio": 0.0, "completions/max_length": 27.0, "completions/max_terminated_length": 27.0, "completions/mean_length": 22.5, "completions/mean_terminated_length": 22.5, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 56.53703703703704, "frac_reward_zero_std": 0.0, "grad_norm": 6.1213059425354, "kl": 0.03503846377134323, "learning_rate": 3.86e-06, "loss": -0.0597, "num_tokens": 919810.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 3053 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 56.55555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.00013420831237453967, "kl": 5.433708429336548e-05, "learning_rate": 3.859444444444445e-06, "loss": 0.0, "num_tokens": 920030.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 3054 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.0, "completions/clipped_ratio": 0.0, "completions/max_length": 59.0, "completions/max_terminated_length": 59.0, "completions/mean_length": 19.0, "completions/mean_terminated_length": 19.0, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 56.574074074074076, "frac_reward_zero_std": 0.0, "grad_norm": 3.606114625930786, "kl": 0.012096640886738896, "learning_rate": 3.858888888888889e-06, "loss": 0.4896, "num_tokens": 920306.0, "reward": 1.75, "reward_std": 2.872281312942505, "rewards/reward_combined/mean": 1.75, "rewards/reward_combined/std": 2.872281312942505, "step": 3055 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 56.592592592592595, "frac_reward_zero_std": 1.0, "grad_norm": 0.04924893006682396, "kl": 0.0029673469834960997, "learning_rate": 3.858333333333333e-06, "loss": 0.0001, "num_tokens": 920584.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3056 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 56.611111111111114, "frac_reward_zero_std": 1.0, "grad_norm": 0.018768643960356712, "kl": 0.0003398805856704712, "learning_rate": 3.857777777777778e-06, "loss": 0.0, "num_tokens": 920796.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 3057 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 12.5, "completions/mean_terminated_length": 12.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 56.629629629629626, "frac_reward_zero_std": 1.0, "grad_norm": 0.0601140521466732, "kl": 0.006662430940195918, "learning_rate": 3.857222222222223e-06, "loss": 0.0003, "num_tokens": 921078.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3058 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.004545454401522875, "clip_ratio/low_min": 0.004545454401522875, "clip_ratio/region_mean": 0.004545454401522875, "completion_length": 52.75, "completions/clipped_ratio": 0.0, "completions/max_length": 70.0, "completions/max_terminated_length": 70.0, "completions/mean_length": 52.75, "completions/mean_terminated_length": 52.75, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 56.648148148148145, "frac_reward_zero_std": 0.0, "grad_norm": 1.6388843059539795, "kl": 0.07160451635718346, "learning_rate": 3.856666666666667e-06, "loss": 0.1974, "num_tokens": 921513.0, "reward": 4.125, "reward_std": 2.25, "rewards/reward_combined/mean": 4.125, "rewards/reward_combined/std": 2.25, "step": 3059 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.75, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 17.75, "completions/mean_terminated_length": 17.75, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 56.666666666666664, "frac_reward_zero_std": 1.0, "grad_norm": 0.2770049571990967, "kl": 0.05610385350883007, "learning_rate": 3.856111111111112e-06, "loss": 0.0024, "num_tokens": 921812.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3060 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 56.68518518518518, "frac_reward_zero_std": 1.0, "grad_norm": 0.012389760464429855, "kl": 0.0166018083691597, "learning_rate": 3.855555555555556e-06, "loss": 0.0008, "num_tokens": 922084.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3061 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 56.7037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.10173963755369186, "kl": 0.005183643370401114, "learning_rate": 3.855e-06, "loss": 0.0003, "num_tokens": 922340.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 3062 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 56.72222222222222, "frac_reward_zero_std": 1.0, "grad_norm": 0.011526980437338352, "kl": 0.002414841204881668, "learning_rate": 3.854444444444445e-06, "loss": 0.0001, "num_tokens": 922584.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 3063 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.0, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 24.0, "completions/mean_terminated_length": 24.0, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 56.74074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.00369492219761014, "kl": 0.09002915024757385, "learning_rate": 3.853888888888889e-06, "loss": 0.0045, "num_tokens": 922948.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 3064 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 56.75925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.008312960155308247, "kl": 0.0023187819169834256, "learning_rate": 3.853333333333334e-06, "loss": 0.0001, "num_tokens": 923244.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3065 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 56.77777777777778, "frac_reward_zero_std": 1.0, "grad_norm": 0.1162838563323021, "kl": 0.013875847216695547, "learning_rate": 3.852777777777778e-06, "loss": 0.0007, "num_tokens": 923534.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3066 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.75, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 4.75, "completions/mean_terminated_length": 4.75, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 56.7962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.09339672327041626, "kl": 0.03055623173713684, "learning_rate": 3.852222222222223e-06, "loss": 0.0015, "num_tokens": 923753.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 3067 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 36.75, "completions/clipped_ratio": 0.0, "completions/max_length": 43.0, "completions/max_terminated_length": 43.0, "completions/mean_length": 36.75, "completions/mean_terminated_length": 36.75, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 56.81481481481482, "frac_reward_zero_std": 1.0, "grad_norm": 0.11084160953760147, "kl": 0.12137174606323242, "learning_rate": 3.8516666666666665e-06, "loss": 0.006, "num_tokens": 924128.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3068 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.0, "completions/max_length": 34.0, "completions/max_terminated_length": 34.0, "completions/mean_length": 32.0, "completions/mean_terminated_length": 32.0, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 56.833333333333336, "frac_reward_zero_std": 0.0, "grad_norm": 2.549938678741455, "kl": 0.09946725144982338, "learning_rate": 3.851111111111112e-06, "loss": 0.0074, "num_tokens": 924492.0, "reward": 2.75, "reward_std": 1.1902379989624023, "rewards/reward_combined/mean": 2.75, "rewards/reward_combined/std": 1.190238118171692, "step": 3069 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.25, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.25, "completions/mean_terminated_length": 13.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 56.851851851851855, "frac_reward_zero_std": 1.0, "grad_norm": 0.006009580567479134, "kl": 0.002099672914482653, "learning_rate": 3.850555555555556e-06, "loss": 0.0001, "num_tokens": 924769.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3070 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 18.0, "completions/mean_terminated_length": 18.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 56.870370370370374, "frac_reward_zero_std": 1.0, "grad_norm": 0.00939005147665739, "kl": 0.0020171570358797908, "learning_rate": 3.85e-06, "loss": 0.0001, "num_tokens": 925081.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3071 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.25, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 17.25, "completions/mean_terminated_length": 17.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 56.888888888888886, "frac_reward_zero_std": 1.0, "grad_norm": 0.018364179879426956, "kl": 0.16990716010332108, "learning_rate": 3.849444444444445e-06, "loss": 0.0085, "num_tokens": 925390.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3072 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.25, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 21.25, "completions/mean_terminated_length": 21.25, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 56.907407407407405, "frac_reward_zero_std": 0.0, "grad_norm": 4.2027387619018555, "kl": 0.10182570293545723, "learning_rate": 3.848888888888889e-06, "loss": -0.1067, "num_tokens": 925703.0, "reward": 6.125, "reward_std": 3.75, "rewards/reward_combined/mean": 6.125, "rewards/reward_combined/std": 3.75, "step": 3073 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.5, "completions/clipped_ratio": 0.0, "completions/max_length": 44.0, "completions/max_terminated_length": 44.0, "completions/mean_length": 12.5, "completions/mean_terminated_length": 12.5, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 56.925925925925924, "frac_reward_zero_std": 0.0, "grad_norm": 5.09504508972168, "kl": 0.02777761220932007, "learning_rate": 3.848333333333334e-06, "loss": 0.46, "num_tokens": 925957.0, "reward": 2.75, "reward_std": 1.5, "rewards/reward_combined/mean": 2.75, "rewards/reward_combined/std": 1.5, "step": 3074 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 56.94444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.03029530867934227, "kl": 0.005479628220200539, "learning_rate": 3.847777777777778e-06, "loss": 0.0003, "num_tokens": 926227.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3075 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 56.96296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.042268816381692886, "kl": 0.026917098090052605, "learning_rate": 3.847222222222223e-06, "loss": 0.0015, "num_tokens": 926497.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3076 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 28.5, "completions/clipped_ratio": 0.0, "completions/max_length": 30.0, "completions/max_terminated_length": 30.0, "completions/mean_length": 28.5, "completions/mean_terminated_length": 28.5, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 56.98148148148148, "frac_reward_zero_std": 0.0, "grad_norm": 2.5566272735595703, "kl": 0.2814135104417801, "learning_rate": 3.8466666666666665e-06, "loss": 0.0085, "num_tokens": 926827.0, "reward": 5.125, "reward_std": 3.4731109142303467, "rewards/reward_combined/mean": 5.125, "rewards/reward_combined/std": 3.4731109142303467, "step": 3077 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.25, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 21.25, "completions/mean_terminated_length": 21.25, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 57.0, "frac_reward_zero_std": 0.0, "grad_norm": 2.1536571979522705, "kl": 0.009907664498314261, "learning_rate": 3.846111111111112e-06, "loss": -0.0108, "num_tokens": 927156.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 3078 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 57.01851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.013219021260738373, "kl": 0.01624482497572899, "learning_rate": 3.845555555555556e-06, "loss": 0.0008, "num_tokens": 927428.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3079 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.0, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 20.0, "completions/mean_terminated_length": 20.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 57.03703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.18065954744815826, "kl": 0.04246322996914387, "learning_rate": 3.8450000000000005e-06, "loss": 0.0022, "num_tokens": 927728.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3080 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.25, "completions/clipped_ratio": 0.0, "completions/max_length": 35.0, "completions/max_terminated_length": 35.0, "completions/mean_length": 25.25, "completions/mean_terminated_length": 25.25, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 57.05555555555556, "frac_reward_zero_std": 0.0, "grad_norm": 12.930730819702148, "kl": 1.7492011040449142, "learning_rate": 3.844444444444445e-06, "loss": 0.1625, "num_tokens": 928077.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 3081 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.0, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 23.0, "completions/mean_terminated_length": 23.0, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 57.074074074074076, "frac_reward_zero_std": 1.0, "grad_norm": 0.04393419995903969, "kl": 0.009916965500451624, "learning_rate": 3.843888888888889e-06, "loss": 0.0005, "num_tokens": 928413.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3082 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.75, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 13.75, "completions/mean_terminated_length": 13.75, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 57.092592592592595, "frac_reward_zero_std": 1.0, "grad_norm": 1.839030385017395, "kl": 0.3062959909439087, "learning_rate": 3.8433333333333335e-06, "loss": 0.014, "num_tokens": 928684.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3083 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 57.111111111111114, "frac_reward_zero_std": 1.0, "grad_norm": 0.00013811553071718663, "kl": 5.3316354751586914e-05, "learning_rate": 3.842777777777778e-06, "loss": 0.0, "num_tokens": 928904.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 3084 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.25, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 22.25, "completions/mean_terminated_length": 22.25, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 57.129629629629626, "frac_reward_zero_std": 0.0, "grad_norm": 3.7319676876068115, "kl": 0.027467001229524612, "learning_rate": 3.842222222222223e-06, "loss": -0.0159, "num_tokens": 929221.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 3085 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.25, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.25, "completions/mean_terminated_length": 13.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 57.148148148148145, "frac_reward_zero_std": 1.0, "grad_norm": 0.006507848389446735, "kl": 0.0020477367797866464, "learning_rate": 3.841666666666667e-06, "loss": 0.0001, "num_tokens": 929498.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3086 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.25, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.25, "completions/mean_terminated_length": 11.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 57.166666666666664, "frac_reward_zero_std": 1.0, "grad_norm": 0.03505842015147209, "kl": 0.01102894707582891, "learning_rate": 3.841111111111112e-06, "loss": 0.0006, "num_tokens": 929771.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3087 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.75, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.75, "completions/mean_terminated_length": 6.75, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 57.18518518518518, "frac_reward_zero_std": 1.0, "grad_norm": 0.017320575192570686, "kl": 0.0010679967235773802, "learning_rate": 3.840555555555555e-06, "loss": 0.0001, "num_tokens": 930006.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 3088 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.75, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 4.75, "completions/mean_terminated_length": 4.75, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 57.2037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.02233324944972992, "kl": 0.02795899659395218, "learning_rate": 3.8400000000000005e-06, "loss": 0.0014, "num_tokens": 930225.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 3089 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 57.22222222222222, "frac_reward_zero_std": 1.0, "grad_norm": 0.021271761506795883, "kl": 0.00044836103916168213, "learning_rate": 3.839444444444445e-06, "loss": 0.0, "num_tokens": 930481.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 3090 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 57.24074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.017376994714140892, "kl": 0.21506811678409576, "learning_rate": 3.838888888888889e-06, "loss": 0.0108, "num_tokens": 930785.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3091 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 57.25925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.014967762865126133, "kl": 0.0003831898211501539, "learning_rate": 3.8383333333333336e-06, "loss": 0.0, "num_tokens": 931055.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3092 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 17.0, "completions/mean_terminated_length": 17.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 57.27777777777778, "frac_reward_zero_std": 1.0, "grad_norm": 0.03669033199548721, "kl": 0.0064917819108814, "learning_rate": 3.837777777777778e-06, "loss": 0.0003, "num_tokens": 931363.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3093 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.25, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 4.25, "completions/mean_terminated_length": 4.25, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 57.2962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.028558174148201942, "kl": 0.0005108952464070171, "learning_rate": 3.837222222222223e-06, "loss": 0.0, "num_tokens": 931576.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 3094 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 57.31481481481482, "frac_reward_zero_std": 1.0, "grad_norm": 0.6449735760688782, "kl": 0.05864404421299696, "learning_rate": 3.836666666666667e-06, "loss": 0.0029, "num_tokens": 931836.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3095 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 28.75, "completions/clipped_ratio": 0.0, "completions/max_length": 30.0, "completions/max_terminated_length": 30.0, "completions/mean_length": 28.75, "completions/mean_terminated_length": 28.75, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 57.333333333333336, "frac_reward_zero_std": 0.0, "grad_norm": 2.340714931488037, "kl": 0.22766504436731339, "learning_rate": 3.836111111111112e-06, "loss": 0.021, "num_tokens": 932167.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 3096 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.0, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 24.0, "completions/mean_terminated_length": 24.0, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 57.351851851851855, "frac_reward_zero_std": 1.0, "grad_norm": 0.004089204128831625, "kl": 0.089933842420578, "learning_rate": 3.835555555555555e-06, "loss": 0.0045, "num_tokens": 932531.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 3097 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.25, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 8.25, "completions/mean_terminated_length": 8.25, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 57.370370370370374, "frac_reward_zero_std": 1.0, "grad_norm": 0.47624871134757996, "kl": 0.05520443618297577, "learning_rate": 3.8350000000000006e-06, "loss": 0.0029, "num_tokens": 932776.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 3098 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 57.388888888888886, "frac_reward_zero_std": 1.0, "grad_norm": 0.07929404079914093, "kl": 0.0025946447858586907, "learning_rate": 3.834444444444445e-06, "loss": 0.0001, "num_tokens": 932992.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 3099 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 57.407407407407405, "frac_reward_zero_std": 1.0, "grad_norm": 0.014453653246164322, "kl": 0.004588592215441167, "learning_rate": 3.833888888888889e-06, "loss": 0.0002, "num_tokens": 933280.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3100 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 57.425925925925924, "frac_reward_zero_std": 1.0, "grad_norm": 0.15802249312400818, "kl": 0.026516391895711422, "learning_rate": 3.833333333333334e-06, "loss": 0.0014, "num_tokens": 933562.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3101 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 43.5, "completions/clipped_ratio": 0.0, "completions/max_length": 45.0, "completions/max_terminated_length": 45.0, "completions/mean_length": 43.5, "completions/mean_terminated_length": 43.5, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 57.44444444444444, "frac_reward_zero_std": 0.0, "grad_norm": 1.469797134399414, "kl": 0.05990342050790787, "learning_rate": 3.832777777777778e-06, "loss": 0.0141, "num_tokens": 934016.0, "reward": 2.375, "reward_std": 1.25, "rewards/reward_combined/mean": 2.375, "rewards/reward_combined/std": 1.25, "step": 3102 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 57.46296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.04068932309746742, "kl": 0.010883231647312641, "learning_rate": 3.832222222222222e-06, "loss": 0.0006, "num_tokens": 934314.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3103 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.5, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 16.5, "completions/mean_terminated_length": 16.5, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 57.48148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.0084369583055377, "kl": 0.015241451561450958, "learning_rate": 3.831666666666667e-06, "loss": 0.0008, "num_tokens": 934628.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 3104 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 57.5, "frac_reward_zero_std": 1.0, "grad_norm": 0.00842549279332161, "kl": 0.003870982676744461, "learning_rate": 3.831111111111112e-06, "loss": 0.0002, "num_tokens": 934888.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 3105 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 57.51851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.039917927235364914, "kl": 0.007463926216587424, "learning_rate": 3.830555555555555e-06, "loss": 0.0004, "num_tokens": 935180.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3106 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 29.75, "completions/clipped_ratio": 0.0, "completions/max_length": 33.0, "completions/max_terminated_length": 33.0, "completions/mean_length": 29.75, "completions/mean_terminated_length": 29.75, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 57.53703703703704, "frac_reward_zero_std": 0.0, "grad_norm": 2.440314769744873, "kl": 0.17232653312385082, "learning_rate": 3.830000000000001e-06, "loss": 0.067, "num_tokens": 935535.0, "reward": 2.5, "reward_std": 4.020779132843018, "rewards/reward_combined/mean": 2.5, "rewards/reward_combined/std": 4.020779609680176, "step": 3107 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.25, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 14.25, "completions/mean_terminated_length": 14.25, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 57.55555555555556, "frac_reward_zero_std": 0.0, "grad_norm": 6.13159704208374, "kl": 0.03764951601624489, "learning_rate": 3.829444444444444e-06, "loss": 0.0818, "num_tokens": 935804.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 3108 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.25, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 19.25, "completions/mean_terminated_length": 19.25, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 57.574074074074076, "frac_reward_zero_std": 1.0, "grad_norm": 0.0980510264635086, "kl": 0.019726875238120556, "learning_rate": 3.828888888888889e-06, "loss": 0.001, "num_tokens": 936109.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3109 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 10.5, "completions/mean_terminated_length": 10.5, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 57.592592592592595, "frac_reward_zero_std": 1.0, "grad_norm": 0.0751933827996254, "kl": 0.007611898705363274, "learning_rate": 3.828333333333334e-06, "loss": 0.0004, "num_tokens": 936375.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3110 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 57.611111111111114, "frac_reward_zero_std": 1.0, "grad_norm": 0.0022090740967541933, "kl": 0.0027748276479542255, "learning_rate": 3.827777777777778e-06, "loss": 0.0001, "num_tokens": 936659.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3111 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.0, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 19.0, "completions/mean_terminated_length": 19.0, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 57.629629629629626, "frac_reward_zero_std": 1.0, "grad_norm": 0.028971143066883087, "kl": 0.039686091244220734, "learning_rate": 3.827222222222222e-06, "loss": 0.002, "num_tokens": 936987.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3112 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 43.0, "completions/clipped_ratio": 0.0, "completions/max_length": 81.0, "completions/max_terminated_length": 81.0, "completions/mean_length": 43.0, "completions/mean_terminated_length": 43.0, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 57.648148148148145, "frac_reward_zero_std": 0.0, "grad_norm": 4.263341903686523, "kl": 0.05371707025915384, "learning_rate": 3.826666666666667e-06, "loss": 0.2889, "num_tokens": 937379.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 3113 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 18.0, "completions/mean_terminated_length": 18.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 57.666666666666664, "frac_reward_zero_std": 1.0, "grad_norm": 0.009434322826564312, "kl": 0.0015786951989866793, "learning_rate": 3.826111111111112e-06, "loss": 0.0001, "num_tokens": 937691.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3114 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 57.68518518518518, "frac_reward_zero_std": 1.0, "grad_norm": 0.011202361434698105, "kl": 0.016243749298155308, "learning_rate": 3.8255555555555554e-06, "loss": 0.0008, "num_tokens": 937951.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3115 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.25, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 10.25, "completions/mean_terminated_length": 10.25, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 57.7037037037037, "frac_reward_zero_std": 0.0, "grad_norm": 4.622066497802734, "kl": 0.024163072579540312, "learning_rate": 3.825000000000001e-06, "loss": 0.0194, "num_tokens": 938212.0, "reward": 2.25, "reward_std": 2.5, "rewards/reward_combined/mean": 2.25, "rewards/reward_combined/std": 2.5, "step": 3116 }, { "clip_ratio/high_max": 0.009999999776482582, "clip_ratio/high_mean": 0.009999999776482582, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.009999999776482582, "completion_length": 26.75, "completions/clipped_ratio": 0.0, "completions/max_length": 31.0, "completions/max_terminated_length": 31.0, "completions/mean_length": 26.75, "completions/mean_terminated_length": 26.75, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 57.72222222222222, "frac_reward_zero_std": 0.0, "grad_norm": 4.4846272468566895, "kl": 0.15811453014612198, "learning_rate": 3.824444444444444e-06, "loss": 0.0048, "num_tokens": 938539.0, "reward": 2.5, "reward_std": 1.7320507764816284, "rewards/reward_combined/mean": 2.5, "rewards/reward_combined/std": 1.7320507764816284, "step": 3117 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.5, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 22.5, "completions/mean_terminated_length": 22.5, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 57.74074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.05461859703063965, "kl": 0.03529401682317257, "learning_rate": 3.823888888888889e-06, "loss": 0.0018, "num_tokens": 938853.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3118 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 57.75925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.08341030776500702, "kl": 0.008575459942221642, "learning_rate": 3.823333333333334e-06, "loss": 0.0004, "num_tokens": 939137.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3119 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.25, "completions/clipped_ratio": 0.0, "completions/max_length": 25.0, "completions/max_terminated_length": 25.0, "completions/mean_length": 22.25, "completions/mean_terminated_length": 22.25, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 57.77777777777778, "frac_reward_zero_std": 1.0, "grad_norm": 0.04968341439962387, "kl": 0.030618852004408836, "learning_rate": 3.822777777777778e-06, "loss": 0.0015, "num_tokens": 939474.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3120 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 33.0, "completions/clipped_ratio": 0.0, "completions/max_length": 37.0, "completions/max_terminated_length": 37.0, "completions/mean_length": 33.0, "completions/mean_terminated_length": 33.0, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 57.7962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.5271922945976257, "kl": 0.14541714638471603, "learning_rate": 3.8222222222222224e-06, "loss": 0.0074, "num_tokens": 939834.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3121 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.75, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 4.75, "completions/mean_terminated_length": 4.75, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 57.81481481481482, "frac_reward_zero_std": 1.0, "grad_norm": 0.04601936787366867, "kl": 0.0018394782673567533, "learning_rate": 3.821666666666667e-06, "loss": 0.0001, "num_tokens": 940053.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 3122 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 57.833333333333336, "frac_reward_zero_std": 1.0, "grad_norm": 0.011326128616929054, "kl": 0.0005384832620620728, "learning_rate": 3.821111111111111e-06, "loss": 0.0, "num_tokens": 940333.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3123 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 57.851851851851855, "frac_reward_zero_std": 1.0, "grad_norm": 0.0434051938354969, "kl": 0.0050515830516815186, "learning_rate": 3.8205555555555555e-06, "loss": 0.0003, "num_tokens": 940541.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 3124 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.25, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 17.25, "completions/mean_terminated_length": 17.25, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 57.870370370370374, "frac_reward_zero_std": 1.0, "grad_norm": 0.11807039380073547, "kl": 0.039512683637440205, "learning_rate": 3.820000000000001e-06, "loss": 0.0023, "num_tokens": 940870.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3125 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 57.888888888888886, "frac_reward_zero_std": 1.0, "grad_norm": 0.05849478766322136, "kl": 0.002514933526981622, "learning_rate": 3.819444444444444e-06, "loss": 0.0001, "num_tokens": 941126.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3126 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 30.5, "completions/clipped_ratio": 0.0, "completions/max_length": 38.0, "completions/max_terminated_length": 38.0, "completions/mean_length": 30.5, "completions/mean_terminated_length": 30.5, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 57.907407407407405, "frac_reward_zero_std": 1.0, "grad_norm": 0.10233496129512787, "kl": 0.044495624490082264, "learning_rate": 3.818888888888889e-06, "loss": 0.0023, "num_tokens": 941472.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3127 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 26.75, "completions/clipped_ratio": 0.0, "completions/max_length": 32.0, "completions/max_terminated_length": 32.0, "completions/mean_length": 26.75, "completions/mean_terminated_length": 26.75, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 57.925925925925924, "frac_reward_zero_std": 1.0, "grad_norm": 0.09512516111135483, "kl": 0.08905315399169922, "learning_rate": 3.818333333333334e-06, "loss": 0.0043, "num_tokens": 941807.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3128 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 57.94444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.11494667083024979, "kl": 0.0395427942276001, "learning_rate": 3.817777777777778e-06, "loss": 0.002, "num_tokens": 942075.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3129 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.5, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 19.5, "completions/mean_terminated_length": 19.5, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 57.96296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.5034273862838745, "kl": 0.10844939574599266, "learning_rate": 3.8172222222222225e-06, "loss": 0.0053, "num_tokens": 942405.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3130 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 18.0, "completions/mean_terminated_length": 18.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 57.98148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.03401752561330795, "kl": 0.16669857501983643, "learning_rate": 3.816666666666667e-06, "loss": 0.0083, "num_tokens": 942717.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3131 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 58.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.004935398697853088, "kl": 0.010406427085399628, "learning_rate": 3.816111111111111e-06, "loss": 0.0005, "num_tokens": 942953.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 3132 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.5, "completions/clipped_ratio": 0.0, "completions/max_length": 30.0, "completions/max_terminated_length": 30.0, "completions/mean_length": 24.5, "completions/mean_terminated_length": 24.5, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 58.01851851851852, "frac_reward_zero_std": 0.0, "grad_norm": 3.601048231124878, "kl": 0.14408687874674797, "learning_rate": 3.8155555555555555e-06, "loss": 0.0377, "num_tokens": 943275.0, "reward": 6.5, "reward_std": 2.345207929611206, "rewards/reward_combined/mean": 6.5, "rewards/reward_combined/std": 2.345207929611206, "step": 3133 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.25, "completions/clipped_ratio": 0.0, "completions/max_length": 29.0, "completions/max_terminated_length": 29.0, "completions/mean_length": 21.25, "completions/mean_terminated_length": 21.25, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 58.03703703703704, "frac_reward_zero_std": 0.0, "grad_norm": 3.6625242233276367, "kl": 0.1316450610756874, "learning_rate": 3.815000000000001e-06, "loss": -0.1061, "num_tokens": 943580.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 3134 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.75, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 18.75, "completions/mean_terminated_length": 18.75, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 58.05555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 1.1880007982254028, "kl": 0.11613204702734947, "learning_rate": 3.8144444444444447e-06, "loss": 0.0053, "num_tokens": 943875.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3135 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 58.074074074074076, "frac_reward_zero_std": 1.0, "grad_norm": 0.08768118917942047, "kl": 0.04790768213570118, "learning_rate": 3.813888888888889e-06, "loss": 0.0024, "num_tokens": 944145.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3136 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.75, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.75, "completions/mean_terminated_length": 14.75, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 58.092592592592595, "frac_reward_zero_std": 1.0, "grad_norm": 0.07226842641830444, "kl": 0.02226531133055687, "learning_rate": 3.813333333333334e-06, "loss": 0.0011, "num_tokens": 944436.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3137 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.25, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 23.25, "completions/mean_terminated_length": 23.25, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 58.111111111111114, "frac_reward_zero_std": 0.0, "grad_norm": 2.3831872940063477, "kl": 0.13099095970392227, "learning_rate": 3.812777777777778e-06, "loss": -0.0271, "num_tokens": 944797.0, "reward": 2.875, "reward_std": 1.25, "rewards/reward_combined/mean": 2.875, "rewards/reward_combined/std": 1.25, "step": 3138 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 58.129629629629626, "frac_reward_zero_std": 1.0, "grad_norm": 0.10020039230585098, "kl": 0.02837537508457899, "learning_rate": 3.8122222222222225e-06, "loss": 0.0014, "num_tokens": 945089.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3139 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 58.148148148148145, "frac_reward_zero_std": 1.0, "grad_norm": 0.039807967841625214, "kl": 0.00393866200465709, "learning_rate": 3.811666666666667e-06, "loss": 0.0002, "num_tokens": 945411.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3140 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.25, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 12.25, "completions/mean_terminated_length": 12.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 58.166666666666664, "frac_reward_zero_std": 1.0, "grad_norm": 0.12738703191280365, "kl": 0.015777386375702918, "learning_rate": 3.8111111111111117e-06, "loss": 0.0008, "num_tokens": 945712.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3141 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 58.18518518518518, "frac_reward_zero_std": 1.0, "grad_norm": 0.0471925213932991, "kl": 0.000886775553226471, "learning_rate": 3.8105555555555556e-06, "loss": 0.0, "num_tokens": 945924.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 3142 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 18.0, "completions/mean_terminated_length": 18.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 58.2037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.0095113106071949, "kl": 0.001527806103695184, "learning_rate": 3.8100000000000004e-06, "loss": 0.0001, "num_tokens": 946236.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3143 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.75, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 4.75, "completions/mean_terminated_length": 4.75, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 58.22222222222222, "frac_reward_zero_std": 1.0, "grad_norm": 0.04955407232046127, "kl": 0.0286661172285676, "learning_rate": 3.8094444444444443e-06, "loss": 0.0015, "num_tokens": 946455.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 3144 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.75, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 13.75, "completions/mean_terminated_length": 13.75, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 58.24074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.05133502930402756, "kl": 0.019215785898268223, "learning_rate": 3.808888888888889e-06, "loss": 0.0011, "num_tokens": 946726.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3145 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.25, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 19.25, "completions/mean_terminated_length": 19.25, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 58.25925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.06279648095369339, "kl": 0.014703869819641113, "learning_rate": 3.808333333333334e-06, "loss": 0.0007, "num_tokens": 947031.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3146 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.5, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 20.5, "completions/mean_terminated_length": 20.5, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 58.27777777777778, "frac_reward_zero_std": 1.0, "grad_norm": 0.09856944531202316, "kl": 0.025489541701972485, "learning_rate": 3.8077777777777782e-06, "loss": 0.0013, "num_tokens": 947365.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3147 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 58.2962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.06264754384756088, "kl": 0.019393892493098974, "learning_rate": 3.8072222222222226e-06, "loss": 0.001, "num_tokens": 947647.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3148 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 29.25, "completions/clipped_ratio": 0.0, "completions/max_length": 33.0, "completions/max_terminated_length": 33.0, "completions/mean_length": 29.25, "completions/mean_terminated_length": 29.25, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 58.31481481481482, "frac_reward_zero_std": 0.0, "grad_norm": 1.9465014934539795, "kl": 0.35805658996105194, "learning_rate": 3.806666666666667e-06, "loss": 0.0607, "num_tokens": 947980.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 3149 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.25, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 17.25, "completions/mean_terminated_length": 17.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 58.333333333333336, "frac_reward_zero_std": 1.0, "grad_norm": 0.028625834733247757, "kl": 0.12969936802983284, "learning_rate": 3.8061111111111117e-06, "loss": 0.0065, "num_tokens": 948289.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3150 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 58.351851851851855, "frac_reward_zero_std": 1.0, "grad_norm": 0.6499285697937012, "kl": 0.07416116073727608, "learning_rate": 3.8055555555555556e-06, "loss": 0.0037, "num_tokens": 948533.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 3151 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 58.370370370370374, "frac_reward_zero_std": 1.0, "grad_norm": 0.004497079644352198, "kl": 0.0005161914741620421, "learning_rate": 3.8050000000000004e-06, "loss": 0.0, "num_tokens": 948811.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3152 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 58.388888888888886, "frac_reward_zero_std": 1.0, "grad_norm": 0.20891296863555908, "kl": 0.04240786284208298, "learning_rate": 3.8044444444444443e-06, "loss": 0.0021, "num_tokens": 949091.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3153 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 58.407407407407405, "frac_reward_zero_std": 1.0, "grad_norm": 0.02294626459479332, "kl": 0.0038864947855472565, "learning_rate": 3.803888888888889e-06, "loss": 0.0002, "num_tokens": 949351.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 3154 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.5, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 8.5, "completions/mean_terminated_length": 8.5, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 58.425925925925924, "frac_reward_zero_std": 1.0, "grad_norm": 0.0511869452893734, "kl": 0.003612323198467493, "learning_rate": 3.803333333333334e-06, "loss": 0.0001, "num_tokens": 949605.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 3155 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.5, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 3.5, "completions/mean_terminated_length": 3.5, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 58.44444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.09536418318748474, "kl": 0.0061325207352638245, "learning_rate": 3.802777777777778e-06, "loss": 0.0003, "num_tokens": 949815.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 3156 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.75, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 4.75, "completions/mean_terminated_length": 4.75, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 58.46296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.04881109669804573, "kl": 0.0030634537688456476, "learning_rate": 3.8022222222222226e-06, "loss": 0.0001, "num_tokens": 950034.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 3157 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.0, "completions/clipped_ratio": 0.0, "completions/max_length": 25.0, "completions/max_terminated_length": 25.0, "completions/mean_length": 22.0, "completions/mean_terminated_length": 22.0, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 58.48148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.0947762280702591, "kl": 0.02187330462038517, "learning_rate": 3.801666666666667e-06, "loss": 0.0011, "num_tokens": 950366.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3158 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 28.75, "completions/clipped_ratio": 0.0, "completions/max_length": 31.0, "completions/max_terminated_length": 31.0, "completions/mean_length": 28.75, "completions/mean_terminated_length": 28.75, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 58.5, "frac_reward_zero_std": 1.0, "grad_norm": 0.11808343231678009, "kl": 0.03578238561749458, "learning_rate": 3.8011111111111113e-06, "loss": 0.0019, "num_tokens": 950705.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3159 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.0, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 23.0, "completions/mean_terminated_length": 23.0, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 58.51851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.3287007808685303, "kl": 0.049397156573832035, "learning_rate": 3.8005555555555557e-06, "loss": 0.0025, "num_tokens": 951045.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3160 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 26.5, "completions/clipped_ratio": 0.0, "completions/max_length": 42.0, "completions/max_terminated_length": 42.0, "completions/mean_length": 26.5, "completions/mean_terminated_length": 26.5, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 58.53703703703704, "frac_reward_zero_std": 0.0, "grad_norm": 2.093181610107422, "kl": 0.24857166036963463, "learning_rate": 3.8000000000000005e-06, "loss": -0.1118, "num_tokens": 951403.0, "reward": 5.875, "reward_std": 3.5910768508911133, "rewards/reward_combined/mean": 5.875, "rewards/reward_combined/std": 3.5910770893096924, "step": 3161 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 17.0, "completions/mean_terminated_length": 17.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 58.55555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.032389480620622635, "kl": 0.004333033226430416, "learning_rate": 3.7994444444444444e-06, "loss": 0.0002, "num_tokens": 951711.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3162 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 26.25, "completions/clipped_ratio": 0.0, "completions/max_length": 29.0, "completions/max_terminated_length": 29.0, "completions/mean_length": 26.25, "completions/mean_terminated_length": 26.25, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 58.574074074074076, "frac_reward_zero_std": 1.0, "grad_norm": 0.5857950448989868, "kl": 0.0675120335072279, "learning_rate": 3.798888888888889e-06, "loss": 0.0035, "num_tokens": 952064.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 3163 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 30.25, "completions/clipped_ratio": 0.0, "completions/max_length": 41.0, "completions/max_terminated_length": 41.0, "completions/mean_length": 30.25, "completions/mean_terminated_length": 30.25, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 58.592592592592595, "frac_reward_zero_std": 0.0, "grad_norm": 2.61323881149292, "kl": 0.06667970679700375, "learning_rate": 3.798333333333334e-06, "loss": -0.0481, "num_tokens": 952421.0, "reward": 5.5, "reward_std": 2.309401035308838, "rewards/reward_combined/mean": 5.5, "rewards/reward_combined/std": 2.309401035308838, "step": 3164 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 58.611111111111114, "frac_reward_zero_std": 1.0, "grad_norm": 0.00926903448998928, "kl": 0.015506099909543991, "learning_rate": 3.797777777777778e-06, "loss": 0.0008, "num_tokens": 952733.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 3165 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 58.629629629629626, "frac_reward_zero_std": 1.0, "grad_norm": 0.1212448999285698, "kl": 0.020742583088576794, "learning_rate": 3.7972222222222227e-06, "loss": 0.001, "num_tokens": 952999.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3166 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 33.5, "completions/clipped_ratio": 0.0, "completions/max_length": 35.0, "completions/max_terminated_length": 35.0, "completions/mean_length": 33.5, "completions/mean_terminated_length": 33.5, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 58.648148148148145, "frac_reward_zero_std": 0.0, "grad_norm": 1.8678245544433594, "kl": 0.2650231122970581, "learning_rate": 3.796666666666667e-06, "loss": 0.0141, "num_tokens": 953361.0, "reward": 7.875, "reward_std": 0.25, "rewards/reward_combined/mean": 7.875, "rewards/reward_combined/std": 0.25, "step": 3167 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 58.666666666666664, "frac_reward_zero_std": 1.0, "grad_norm": 0.008120649494230747, "kl": 0.0016231834888458252, "learning_rate": 3.7961111111111114e-06, "loss": 0.0001, "num_tokens": 953573.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 3168 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.25, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.25, "completions/mean_terminated_length": 14.25, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 58.68518518518518, "frac_reward_zero_std": 1.0, "grad_norm": 0.029296085238456726, "kl": 0.007613183464854956, "learning_rate": 3.7955555555555557e-06, "loss": 0.0004, "num_tokens": 953862.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3169 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 44.25, "completions/clipped_ratio": 0.0, "completions/max_length": 48.0, "completions/max_terminated_length": 48.0, "completions/mean_length": 44.25, "completions/mean_terminated_length": 44.25, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 58.7037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.022899828851222992, "kl": 0.049504829570651054, "learning_rate": 3.7950000000000005e-06, "loss": 0.0025, "num_tokens": 954319.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 3170 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 58.72222222222222, "frac_reward_zero_std": 1.0, "grad_norm": 0.029216913506388664, "kl": 0.006163960322737694, "learning_rate": 3.7944444444444444e-06, "loss": 0.0003, "num_tokens": 954587.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3171 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 58.74074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.003948436118662357, "kl": 0.0027847588062286377, "learning_rate": 3.7938888888888892e-06, "loss": 0.0001, "num_tokens": 954871.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3172 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 26.25, "completions/clipped_ratio": 0.0, "completions/max_length": 48.0, "completions/max_terminated_length": 48.0, "completions/mean_length": 26.25, "completions/mean_terminated_length": 26.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 58.75925925925926, "frac_reward_zero_std": 0.0, "grad_norm": 2.3462793827056885, "kl": 0.0922434851527214, "learning_rate": 3.793333333333334e-06, "loss": 0.0266, "num_tokens": 955196.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 3173 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 58.77777777777778, "frac_reward_zero_std": 1.0, "grad_norm": 0.005497423931956291, "kl": 0.010291464626789093, "learning_rate": 3.792777777777778e-06, "loss": 0.0005, "num_tokens": 955432.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 3174 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 58.7962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.04298729449510574, "kl": 0.2302454337477684, "learning_rate": 3.7922222222222227e-06, "loss": 0.0115, "num_tokens": 955734.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3175 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 28.25, "completions/clipped_ratio": 0.0, "completions/max_length": 31.0, "completions/max_terminated_length": 31.0, "completions/mean_length": 28.25, "completions/mean_terminated_length": 28.25, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 58.81481481481482, "frac_reward_zero_std": 1.0, "grad_norm": 0.07894215732812881, "kl": 0.049816759303212166, "learning_rate": 3.7916666666666666e-06, "loss": 0.0025, "num_tokens": 956075.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3176 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.25, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 9.25, "completions/mean_terminated_length": 9.25, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 58.833333333333336, "frac_reward_zero_std": 1.0, "grad_norm": 0.03844063729047775, "kl": 0.013430262915790081, "learning_rate": 3.7911111111111114e-06, "loss": 0.0007, "num_tokens": 956336.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3177 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 58.851851851851855, "frac_reward_zero_std": 1.0, "grad_norm": 0.020593393594026566, "kl": 0.00047167239245027304, "learning_rate": 3.7905555555555558e-06, "loss": 0.0, "num_tokens": 956592.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 3178 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 58.870370370370374, "frac_reward_zero_std": 1.0, "grad_norm": 0.000727338541764766, "kl": 0.0017895485507324338, "learning_rate": 3.79e-06, "loss": 0.0001, "num_tokens": 956872.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3179 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 58.888888888888886, "frac_reward_zero_std": 1.0, "grad_norm": 0.00014305012882687151, "kl": 5.003809928894043e-05, "learning_rate": 3.7894444444444445e-06, "loss": 0.0, "num_tokens": 957092.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 3180 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.75, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.75, "completions/mean_terminated_length": 6.75, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 58.907407407407405, "frac_reward_zero_std": 1.0, "grad_norm": 0.01930982992053032, "kl": 0.0015588061942253262, "learning_rate": 3.7888888888888893e-06, "loss": 0.0001, "num_tokens": 957327.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 3181 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 10.5, "completions/mean_terminated_length": 10.5, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 58.925925925925924, "frac_reward_zero_std": 1.0, "grad_norm": 0.03789703547954559, "kl": 0.002200112910941243, "learning_rate": 3.788333333333334e-06, "loss": 0.0001, "num_tokens": 957589.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3182 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.5, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 22.5, "completions/mean_terminated_length": 22.5, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 58.94444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.03491241857409477, "kl": 0.032619014382362366, "learning_rate": 3.787777777777778e-06, "loss": 0.0016, "num_tokens": 957907.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3183 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.75, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 11.75, "completions/mean_terminated_length": 11.75, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 58.96296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.5899437069892883, "kl": 0.06877947971224785, "learning_rate": 3.7872222222222228e-06, "loss": 0.0034, "num_tokens": 958174.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3184 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 58.98148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.050050102174282074, "kl": 0.01080898754298687, "learning_rate": 3.7866666666666667e-06, "loss": 0.0005, "num_tokens": 958446.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3185 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.25, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 12.25, "completions/mean_terminated_length": 12.25, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 59.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.014035825617611408, "kl": 0.0013493557489709929, "learning_rate": 3.7861111111111115e-06, "loss": 0.0001, "num_tokens": 958719.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3186 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 59.01851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.04550528526306152, "kl": 0.024684779345989227, "learning_rate": 3.785555555555556e-06, "loss": 0.0012, "num_tokens": 958993.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3187 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 59.03703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.004979588557034731, "kl": 0.0029082244727760553, "learning_rate": 3.785e-06, "loss": 0.0001, "num_tokens": 959289.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3188 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.5, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 19.5, "completions/mean_terminated_length": 19.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 59.05555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.14971674978733063, "kl": 0.03942887578159571, "learning_rate": 3.784444444444445e-06, "loss": 0.002, "num_tokens": 959615.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3189 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 59.074074074074076, "frac_reward_zero_std": 1.0, "grad_norm": 0.008265774697065353, "kl": 0.01687889639288187, "learning_rate": 3.7838888888888893e-06, "loss": 0.0008, "num_tokens": 959875.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3190 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.75, "completions/clipped_ratio": 0.0, "completions/max_length": 28.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 22.75, "completions/mean_terminated_length": 22.75, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 59.092592592592595, "frac_reward_zero_std": 1.0, "grad_norm": 0.12076694518327713, "kl": 0.025311589241027832, "learning_rate": 3.7833333333333337e-06, "loss": 0.0013, "num_tokens": 960214.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 3191 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.0, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 19.0, "completions/mean_terminated_length": 19.0, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 59.111111111111114, "frac_reward_zero_std": 1.0, "grad_norm": 0.031195644289255142, "kl": 0.03557034209370613, "learning_rate": 3.782777777777778e-06, "loss": 0.0018, "num_tokens": 960542.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3192 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.75, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.75, "completions/mean_terminated_length": 14.75, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 59.129629629629626, "frac_reward_zero_std": 1.0, "grad_norm": 0.054643187671899796, "kl": 0.020760266110301018, "learning_rate": 3.782222222222223e-06, "loss": 0.001, "num_tokens": 960833.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3193 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.0, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 24.0, "completions/mean_terminated_length": 24.0, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 59.148148148148145, "frac_reward_zero_std": 1.0, "grad_norm": 0.0041557736694812775, "kl": 0.09004123136401176, "learning_rate": 3.7816666666666667e-06, "loss": 0.0045, "num_tokens": 961197.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 3194 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 59.166666666666664, "frac_reward_zero_std": 1.0, "grad_norm": 0.03676632419228554, "kl": 0.0030144639313220978, "learning_rate": 3.7811111111111115e-06, "loss": 0.0001, "num_tokens": 961453.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3195 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.75, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.75, "completions/mean_terminated_length": 6.75, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 59.18518518518518, "frac_reward_zero_std": 1.0, "grad_norm": 0.021765077486634254, "kl": 0.0014143344014883041, "learning_rate": 3.7805555555555555e-06, "loss": 0.0001, "num_tokens": 961688.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 3196 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.0, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 21.0, "completions/mean_terminated_length": 21.0, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 59.2037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.09133893251419067, "kl": 0.03034512046724558, "learning_rate": 3.7800000000000002e-06, "loss": 0.0015, "num_tokens": 962024.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3197 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.25, "completions/clipped_ratio": 0.0, "completions/max_length": 27.0, "completions/max_terminated_length": 27.0, "completions/mean_length": 22.25, "completions/mean_terminated_length": 22.25, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 59.22222222222222, "frac_reward_zero_std": 1.0, "grad_norm": 0.2171374261379242, "kl": 0.05778507888317108, "learning_rate": 3.779444444444445e-06, "loss": 0.0029, "num_tokens": 962333.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3198 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 59.24074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.012341918423771858, "kl": 0.00371420755982399, "learning_rate": 3.7788888888888894e-06, "loss": 0.0002, "num_tokens": 962593.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 3199 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 5.75, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 5.75, "completions/mean_terminated_length": 5.75, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 59.25925925925926, "frac_reward_zero_std": 0.0, "grad_norm": 12.168850898742676, "kl": 0.02914503403007984, "learning_rate": 3.7783333333333337e-06, "loss": 0.244, "num_tokens": 962816.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 3200 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 59.27777777777778, "frac_reward_zero_std": 1.0, "grad_norm": 0.007826860062777996, "kl": 0.001732170581817627, "learning_rate": 3.777777777777778e-06, "loss": 0.0001, "num_tokens": 963028.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 3201 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 59.2962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.033856380730867386, "kl": 0.009959203889593482, "learning_rate": 3.777222222222223e-06, "loss": 0.0005, "num_tokens": 963316.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3202 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 29.0, "completions/clipped_ratio": 0.0, "completions/max_length": 31.0, "completions/max_terminated_length": 31.0, "completions/mean_length": 29.0, "completions/mean_terminated_length": 29.0, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 59.31481481481482, "frac_reward_zero_std": 1.0, "grad_norm": 0.6577698588371277, "kl": 0.14378930814564228, "learning_rate": 3.776666666666667e-06, "loss": 0.0073, "num_tokens": 963660.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3203 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.5, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 4.5, "completions/mean_terminated_length": 4.5, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 59.333333333333336, "frac_reward_zero_std": 1.0, "grad_norm": 0.053747184574604034, "kl": 0.001439103449229151, "learning_rate": 3.7761111111111116e-06, "loss": 0.0001, "num_tokens": 963878.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 3204 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.25, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 17.25, "completions/mean_terminated_length": 17.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 59.351851851851855, "frac_reward_zero_std": 1.0, "grad_norm": 0.16622185707092285, "kl": 0.183536097407341, "learning_rate": 3.7755555555555555e-06, "loss": 0.0092, "num_tokens": 964187.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3205 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.25, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 4.25, "completions/mean_terminated_length": 4.25, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 59.370370370370374, "frac_reward_zero_std": 1.0, "grad_norm": 0.022980159148573875, "kl": 0.0004116833151783794, "learning_rate": 3.7750000000000003e-06, "loss": 0.0, "num_tokens": 964400.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 3206 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 59.388888888888886, "frac_reward_zero_std": 1.0, "grad_norm": 0.009350917302072048, "kl": 0.015427803620696068, "learning_rate": 3.774444444444445e-06, "loss": 0.0008, "num_tokens": 964712.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 3207 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 59.407407407407405, "frac_reward_zero_std": 1.0, "grad_norm": 0.00014495840878225863, "kl": 5.0358474254608154e-05, "learning_rate": 3.773888888888889e-06, "loss": 0.0, "num_tokens": 964932.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 3208 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.0, "completions/clipped_ratio": 0.0, "completions/max_length": 37.0, "completions/max_terminated_length": 37.0, "completions/mean_length": 24.0, "completions/mean_terminated_length": 24.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 59.425925925925924, "frac_reward_zero_std": 1.0, "grad_norm": 0.5174953937530518, "kl": 0.1348112877458334, "learning_rate": 3.7733333333333338e-06, "loss": 0.0068, "num_tokens": 965248.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3209 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.25, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.25, "completions/mean_terminated_length": 13.25, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 59.44444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.15221500396728516, "kl": 0.030050507397390902, "learning_rate": 3.772777777777778e-06, "loss": 0.0016, "num_tokens": 965533.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3210 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.25, "completions/clipped_ratio": 0.0, "completions/max_length": 38.0, "completions/max_terminated_length": 38.0, "completions/mean_length": 32.25, "completions/mean_terminated_length": 32.25, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 59.46296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.23556825518608093, "kl": 0.07430951669812202, "learning_rate": 3.7722222222222225e-06, "loss": 0.0038, "num_tokens": 965886.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3211 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 59.48148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.32121506333351135, "kl": 0.03672574367374182, "learning_rate": 3.771666666666667e-06, "loss": 0.0018, "num_tokens": 966150.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3212 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 30.5, "completions/clipped_ratio": 0.0, "completions/max_length": 35.0, "completions/max_terminated_length": 35.0, "completions/mean_length": 30.5, "completions/mean_terminated_length": 30.5, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 59.5, "frac_reward_zero_std": 0.0, "grad_norm": 2.756197452545166, "kl": 0.09502818435430527, "learning_rate": 3.7711111111111116e-06, "loss": 0.0369, "num_tokens": 966508.0, "reward": 5.5, "reward_std": 2.309401035308838, "rewards/reward_combined/mean": 5.5, "rewards/reward_combined/std": 2.309401035308838, "step": 3213 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.25, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 16.25, "completions/mean_terminated_length": 16.25, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 59.51851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 7.377974987030029, "kl": 0.9080493003129959, "learning_rate": 3.7705555555555555e-06, "loss": 0.0464, "num_tokens": 966813.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3214 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 28.75, "completions/clipped_ratio": 0.0, "completions/max_length": 30.0, "completions/max_terminated_length": 30.0, "completions/mean_length": 28.75, "completions/mean_terminated_length": 28.75, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 59.53703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.9420723915100098, "kl": 0.35599467158317566, "learning_rate": 3.7700000000000003e-06, "loss": 0.0179, "num_tokens": 967144.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3215 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.5, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 20.5, "completions/mean_terminated_length": 20.5, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 59.55555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.027185998857021332, "kl": 0.0027852430939674377, "learning_rate": 3.769444444444445e-06, "loss": 0.0001, "num_tokens": 967470.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3216 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.25, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 10.25, "completions/mean_terminated_length": 10.25, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 59.574074074074076, "frac_reward_zero_std": 1.0, "grad_norm": 0.006216807756572962, "kl": 0.00017154053784906864, "learning_rate": 3.768888888888889e-06, "loss": 0.0, "num_tokens": 967727.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 3217 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.0, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 23.0, "completions/mean_terminated_length": 23.0, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 59.592592592592595, "frac_reward_zero_std": 1.0, "grad_norm": 0.06031180918216705, "kl": 0.019859046675264835, "learning_rate": 3.768333333333334e-06, "loss": 0.001, "num_tokens": 968047.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3218 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.011627906933426857, "clip_ratio/low_min": 0.011627906933426857, "clip_ratio/region_mean": 0.011627906933426857, "completion_length": 21.75, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 21.75, "completions/mean_terminated_length": 21.75, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 59.611111111111114, "frac_reward_zero_std": 0.0, "grad_norm": 8.823089599609375, "kl": 0.10902068763971329, "learning_rate": 3.767777777777778e-06, "loss": 0.0385, "num_tokens": 968358.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 3219 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 43.0, "completions/clipped_ratio": 0.0, "completions/max_length": 43.0, "completions/max_terminated_length": 43.0, "completions/mean_length": 43.0, "completions/mean_terminated_length": 43.0, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 59.629629629629626, "frac_reward_zero_std": 1.0, "grad_norm": 0.019669203087687492, "kl": 0.05088961310684681, "learning_rate": 3.7672222222222225e-06, "loss": 0.0025, "num_tokens": 968810.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 3220 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.75, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 3.75, "completions/mean_terminated_length": 3.75, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 59.648148148148145, "frac_reward_zero_std": 1.0, "grad_norm": 0.13287459313869476, "kl": 0.0072037228383123875, "learning_rate": 3.766666666666667e-06, "loss": 0.0004, "num_tokens": 969021.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 3221 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.25, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.25, "completions/mean_terminated_length": 13.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 59.666666666666664, "frac_reward_zero_std": 1.0, "grad_norm": 0.005549197550863028, "kl": 0.0020015843911096454, "learning_rate": 3.7661111111111117e-06, "loss": 0.0001, "num_tokens": 969298.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3222 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 59.68518518518518, "frac_reward_zero_std": 1.0, "grad_norm": 0.02036476694047451, "kl": 0.008827347308397293, "learning_rate": 3.7655555555555556e-06, "loss": 0.0004, "num_tokens": 969584.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3223 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 18.0, "completions/mean_terminated_length": 18.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 59.7037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.008798274211585522, "kl": 0.0012108153896406293, "learning_rate": 3.7650000000000004e-06, "loss": 0.0001, "num_tokens": 969896.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3224 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 59.72222222222222, "frac_reward_zero_std": 1.0, "grad_norm": 0.0066185821779072285, "kl": 0.00022835603158455342, "learning_rate": 3.764444444444445e-06, "loss": 0.0, "num_tokens": 970176.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3225 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.25, "completions/clipped_ratio": 0.0, "completions/max_length": 34.0, "completions/max_terminated_length": 34.0, "completions/mean_length": 32.25, "completions/mean_terminated_length": 32.25, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 59.74074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.07407813519239426, "kl": 0.08094332739710808, "learning_rate": 3.763888888888889e-06, "loss": 0.004, "num_tokens": 970533.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3226 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.0, "completions/clipped_ratio": 0.0, "completions/max_length": 27.0, "completions/max_terminated_length": 27.0, "completions/mean_length": 24.0, "completions/mean_terminated_length": 24.0, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 59.75925925925926, "frac_reward_zero_std": 0.0, "grad_norm": 5.232766151428223, "kl": 0.14131219685077667, "learning_rate": 3.763333333333334e-06, "loss": 0.0216, "num_tokens": 970849.0, "reward": 2.0, "reward_std": 2.4494898319244385, "rewards/reward_combined/mean": 2.0, "rewards/reward_combined/std": 2.4494898319244385, "step": 3227 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 59.77777777777778, "frac_reward_zero_std": 1.0, "grad_norm": 0.0224266666918993, "kl": 0.004253081977367401, "learning_rate": 3.762777777777778e-06, "loss": 0.0002, "num_tokens": 971151.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3228 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 59.7962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.005453574936836958, "kl": 0.001122650457546115, "learning_rate": 3.7622222222222226e-06, "loss": 0.0001, "num_tokens": 971411.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 3229 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.75, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 17.75, "completions/mean_terminated_length": 17.75, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 59.81481481481482, "frac_reward_zero_std": 1.0, "grad_norm": 0.09742196649312973, "kl": 0.018032620195299387, "learning_rate": 3.761666666666667e-06, "loss": 0.0009, "num_tokens": 971710.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3230 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.25, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.25, "completions/mean_terminated_length": 14.25, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 59.833333333333336, "frac_reward_zero_std": 1.0, "grad_norm": 0.07459329813718796, "kl": 0.021896670572459698, "learning_rate": 3.7611111111111113e-06, "loss": 0.0012, "num_tokens": 971999.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3231 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 59.851851851851855, "frac_reward_zero_std": 1.0, "grad_norm": 0.02855066768825054, "kl": 0.00464949756860733, "learning_rate": 3.7605555555555556e-06, "loss": 0.0002, "num_tokens": 972267.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3232 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 75.75, "completions/clipped_ratio": 0.25, "completions/max_length": 256.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 75.75, "completions/mean_terminated_length": 15.666666984558105, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 59.870370370370374, "frac_reward_zero_std": 0.0, "grad_norm": 2.116550922393799, "kl": 0.04569835960865021, "learning_rate": 3.7600000000000004e-06, "loss": 0.4561, "num_tokens": 972790.0, "reward": 5.675000190734863, "reward_std": 3.6499998569488525, "rewards/reward_combined/mean": 5.675000190734863, "rewards/reward_combined/std": 3.6500000953674316, "step": 3233 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 59.888888888888886, "frac_reward_zero_std": 0.0, "grad_norm": 7.098142147064209, "kl": 0.008306290954351425, "learning_rate": 3.759444444444445e-06, "loss": 0.1003, "num_tokens": 973038.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 3234 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 59.907407407407405, "frac_reward_zero_std": 1.0, "grad_norm": 0.0058235423639416695, "kl": 0.010162971913814545, "learning_rate": 3.758888888888889e-06, "loss": 0.0005, "num_tokens": 973274.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 3235 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 12.5, "completions/mean_terminated_length": 12.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 59.925925925925924, "frac_reward_zero_std": 1.0, "grad_norm": 0.2796732187271118, "kl": 0.037171732634305954, "learning_rate": 3.758333333333334e-06, "loss": 0.002, "num_tokens": 973544.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3236 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 59.94444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.03942904248833656, "kl": 0.03028384130448103, "learning_rate": 3.757777777777778e-06, "loss": 0.0015, "num_tokens": 973812.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3237 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 59.96296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.018507888540625572, "kl": 0.008084533736109734, "learning_rate": 3.7572222222222226e-06, "loss": 0.0004, "num_tokens": 974086.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3238 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 59.98148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.025471247732639313, "kl": 0.0026877362579398323, "learning_rate": 3.756666666666667e-06, "loss": 0.0001, "num_tokens": 974356.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3239 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 60.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.09317994862794876, "kl": 0.006539231515489519, "learning_rate": 3.7561111111111113e-06, "loss": 0.0003, "num_tokens": 974678.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3240 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.5, "completions/clipped_ratio": 0.0, "completions/max_length": 30.0, "completions/max_terminated_length": 30.0, "completions/mean_length": 23.5, "completions/mean_terminated_length": 23.5, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 60.01851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.40341946482658386, "kl": 0.18226273357868195, "learning_rate": 3.7555555555555557e-06, "loss": 0.009, "num_tokens": 974992.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 3241 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 10.5, "completions/mean_terminated_length": 10.5, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 60.03703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.004980140831321478, "kl": 0.0007633611676283181, "learning_rate": 3.7550000000000005e-06, "loss": 0.0, "num_tokens": 975254.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3242 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 60.05555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.01074548251926899, "kl": 0.001347184181213379, "learning_rate": 3.754444444444445e-06, "loss": 0.0001, "num_tokens": 975466.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 3243 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 41.5, "completions/clipped_ratio": 0.0, "completions/max_length": 44.0, "completions/max_terminated_length": 44.0, "completions/mean_length": 41.5, "completions/mean_terminated_length": 41.5, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "epoch": 60.074074074074076, "frac_reward_zero_std": 1.0, "grad_norm": 0.09402364492416382, "kl": 0.05571155250072479, "learning_rate": 3.753888888888889e-06, "loss": 0.0028, "num_tokens": 975912.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 3244 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.75, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 14.75, "completions/mean_terminated_length": 14.75, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 60.092592592592595, "frac_reward_zero_std": 0.0, "grad_norm": 4.512792110443115, "kl": 0.03698847535997629, "learning_rate": 3.753333333333334e-06, "loss": -0.1036, "num_tokens": 976191.0, "reward": 7.625, "reward_std": 0.25, "rewards/reward_combined/mean": 7.625, "rewards/reward_combined/std": 0.25, "step": 3245 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 60.111111111111114, "frac_reward_zero_std": 1.0, "grad_norm": 0.065815269947052, "kl": 0.004285166505724192, "learning_rate": 3.752777777777778e-06, "loss": 0.0002, "num_tokens": 976467.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3246 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 60.129629629629626, "frac_reward_zero_std": 1.0, "grad_norm": 0.7814235687255859, "kl": 0.33185581862926483, "learning_rate": 3.7522222222222227e-06, "loss": 0.0166, "num_tokens": 976771.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3247 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.0, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 23.0, "completions/mean_terminated_length": 23.0, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 60.148148148148145, "frac_reward_zero_std": 1.0, "grad_norm": 0.13256508111953735, "kl": 0.03055038396269083, "learning_rate": 3.7516666666666666e-06, "loss": 0.0015, "num_tokens": 977091.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3248 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 60.166666666666664, "frac_reward_zero_std": 1.0, "grad_norm": 0.005830288399010897, "kl": 0.010152161121368408, "learning_rate": 3.7511111111111114e-06, "loss": 0.0005, "num_tokens": 977327.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 3249 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 60.18518518518518, "frac_reward_zero_std": 1.0, "grad_norm": 0.020344045013189316, "kl": 0.0016653947532176971, "learning_rate": 3.7505555555555557e-06, "loss": 0.0001, "num_tokens": 977571.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 3250 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 60.2037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.05162215232849121, "kl": 0.01669926382601261, "learning_rate": 3.7500000000000005e-06, "loss": 0.0009, "num_tokens": 977861.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3251 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 26.75, "completions/clipped_ratio": 0.0, "completions/max_length": 48.0, "completions/max_terminated_length": 48.0, "completions/mean_length": 26.75, "completions/mean_terminated_length": 26.75, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 60.22222222222222, "frac_reward_zero_std": 1.0, "grad_norm": 0.055804502218961716, "kl": 0.05309124104678631, "learning_rate": 3.749444444444445e-06, "loss": 0.002, "num_tokens": 978188.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3252 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 60.24074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.2731945812702179, "kl": 0.031898993998765945, "learning_rate": 3.7488888888888892e-06, "loss": 0.0016, "num_tokens": 978448.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 3253 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.75, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 4.75, "completions/mean_terminated_length": 4.75, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 60.25925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.005675759166479111, "kl": 0.0014511466142721474, "learning_rate": 3.748333333333334e-06, "loss": 0.0001, "num_tokens": 978667.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 3254 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 60.27777777777778, "frac_reward_zero_std": 1.0, "grad_norm": 0.00721801957115531, "kl": 0.0002040361287072301, "learning_rate": 3.747777777777778e-06, "loss": 0.0, "num_tokens": 978935.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3255 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 60.2962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.015203535556793213, "kl": 0.029377058148384094, "learning_rate": 3.7472222222222227e-06, "loss": 0.0015, "num_tokens": 979151.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 3256 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.25, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.25, "completions/mean_terminated_length": 15.25, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 60.31481481481482, "frac_reward_zero_std": 1.0, "grad_norm": 0.047480590641498566, "kl": 0.010568130761384964, "learning_rate": 3.7466666666666667e-06, "loss": 0.0005, "num_tokens": 979444.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3257 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 60.333333333333336, "frac_reward_zero_std": 1.0, "grad_norm": 0.04198266193270683, "kl": 0.010981231927871704, "learning_rate": 3.7461111111111114e-06, "loss": 0.0005, "num_tokens": 979716.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3258 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 60.351851851851855, "frac_reward_zero_std": 1.0, "grad_norm": 0.027004431933164597, "kl": 0.004668138455599546, "learning_rate": 3.7455555555555558e-06, "loss": 0.0002, "num_tokens": 979986.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3259 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.25, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 4.25, "completions/mean_terminated_length": 4.25, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 60.370370370370374, "frac_reward_zero_std": 1.0, "grad_norm": 0.020478632301092148, "kl": 0.00038139818934723735, "learning_rate": 3.745e-06, "loss": 0.0, "num_tokens": 980199.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 3260 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 60.388888888888886, "frac_reward_zero_std": 1.0, "grad_norm": 0.06053868681192398, "kl": 0.03075536247342825, "learning_rate": 3.744444444444445e-06, "loss": 0.0015, "num_tokens": 980467.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3261 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 17.0, "completions/mean_terminated_length": 17.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 60.407407407407405, "frac_reward_zero_std": 1.0, "grad_norm": 0.02266186848282814, "kl": 0.16849920898675919, "learning_rate": 3.7438888888888893e-06, "loss": 0.0084, "num_tokens": 980775.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3262 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 33.0, "completions/clipped_ratio": 0.0, "completions/max_length": 35.0, "completions/max_terminated_length": 35.0, "completions/mean_length": 33.0, "completions/mean_terminated_length": 33.0, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 60.425925925925924, "frac_reward_zero_std": 0.0, "grad_norm": 1.903356671333313, "kl": 0.04130503535270691, "learning_rate": 3.7433333333333336e-06, "loss": 0.0156, "num_tokens": 981131.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 3263 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.75, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 13.75, "completions/mean_terminated_length": 13.75, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 60.44444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.11785285174846649, "kl": 0.0118315692525357, "learning_rate": 3.742777777777778e-06, "loss": 0.0005, "num_tokens": 981398.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3264 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 60.46296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.01318493951112032, "kl": 0.015493812970817089, "learning_rate": 3.7422222222222228e-06, "loss": 0.0008, "num_tokens": 981658.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3265 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 35.75, "completions/clipped_ratio": 0.0, "completions/max_length": 36.0, "completions/max_terminated_length": 36.0, "completions/mean_length": 35.75, "completions/mean_terminated_length": 35.75, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 60.48148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.08304750919342041, "kl": 0.08064145222306252, "learning_rate": 3.7416666666666667e-06, "loss": 0.004, "num_tokens": 982029.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3266 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 60.5, "frac_reward_zero_std": 1.0, "grad_norm": 0.04882539436221123, "kl": 0.016939186491072178, "learning_rate": 3.7411111111111115e-06, "loss": 0.0008, "num_tokens": 982289.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3267 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 60.51851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.0504462830722332, "kl": 0.011092187836766243, "learning_rate": 3.7405555555555554e-06, "loss": 0.0006, "num_tokens": 982571.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3268 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 30.25, "completions/clipped_ratio": 0.0, "completions/max_length": 32.0, "completions/max_terminated_length": 32.0, "completions/mean_length": 30.25, "completions/mean_terminated_length": 30.25, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 60.53703703703704, "frac_reward_zero_std": 0.0, "grad_norm": 2.4876723289489746, "kl": 0.25323865562677383, "learning_rate": 3.74e-06, "loss": 0.0061, "num_tokens": 982908.0, "reward": 5.125, "reward_std": 3.4731109142303467, "rewards/reward_combined/mean": 5.125, "rewards/reward_combined/std": 3.4731109142303467, "step": 3269 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 60.55555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.00014441304665524513, "kl": 4.6350061893463135e-05, "learning_rate": 3.739444444444445e-06, "loss": 0.0, "num_tokens": 983128.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 3270 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.75, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 17.75, "completions/mean_terminated_length": 17.75, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 60.574074074074076, "frac_reward_zero_std": 1.0, "grad_norm": 0.13004940748214722, "kl": 0.0377192422747612, "learning_rate": 3.7388888888888893e-06, "loss": 0.0018, "num_tokens": 983419.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3271 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.5, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 19.5, "completions/mean_terminated_length": 19.5, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 60.592592592592595, "frac_reward_zero_std": 0.0, "grad_norm": 3.6001315116882324, "kl": 0.010534189874306321, "learning_rate": 3.7383333333333337e-06, "loss": 0.0198, "num_tokens": 983741.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 3272 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 60.611111111111114, "frac_reward_zero_std": 1.0, "grad_norm": 0.004810973536223173, "kl": 0.002817286876961589, "learning_rate": 3.737777777777778e-06, "loss": 0.0001, "num_tokens": 984025.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3273 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.25, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.25, "completions/mean_terminated_length": 13.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 60.629629629629626, "frac_reward_zero_std": 1.0, "grad_norm": 0.0009529200615361333, "kl": 0.001880386145785451, "learning_rate": 3.737222222222223e-06, "loss": 0.0001, "num_tokens": 984302.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3274 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 29.25, "completions/clipped_ratio": 0.0, "completions/max_length": 41.0, "completions/max_terminated_length": 41.0, "completions/mean_length": 29.25, "completions/mean_terminated_length": 29.25, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 60.648148148148145, "frac_reward_zero_std": 0.0, "grad_norm": 2.400193691253662, "kl": 0.06805730238556862, "learning_rate": 3.7366666666666667e-06, "loss": 0.0037, "num_tokens": 984655.0, "reward": 4.25, "reward_std": 3.752776622772217, "rewards/reward_combined/mean": 4.25, "rewards/reward_combined/std": 3.752776861190796, "step": 3275 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.0, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 21.0, "completions/mean_terminated_length": 21.0, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 60.666666666666664, "frac_reward_zero_std": 0.0, "grad_norm": 2.4768009185791016, "kl": 0.015001873485744, "learning_rate": 3.7361111111111115e-06, "loss": 0.0015, "num_tokens": 984987.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 3276 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 60.68518518518518, "frac_reward_zero_std": 1.0, "grad_norm": 0.1622043251991272, "kl": 0.05193705204874277, "learning_rate": 3.7355555555555555e-06, "loss": 0.0027, "num_tokens": 985259.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3277 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 60.7037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.0015020148130133748, "kl": 0.00017296969599556178, "learning_rate": 3.7350000000000002e-06, "loss": 0.0, "num_tokens": 985515.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 3278 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 28.5, "completions/clipped_ratio": 0.0, "completions/max_length": 55.0, "completions/max_terminated_length": 55.0, "completions/mean_length": 28.5, "completions/mean_terminated_length": 28.5, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 60.72222222222222, "frac_reward_zero_std": 0.0, "grad_norm": 4.728142261505127, "kl": 0.0821068100631237, "learning_rate": 3.734444444444445e-06, "loss": 0.2383, "num_tokens": 985853.0, "reward": 5.875, "reward_std": 3.25, "rewards/reward_combined/mean": 5.875, "rewards/reward_combined/std": 3.25, "step": 3279 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 60.74074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.08026643842458725, "kl": 0.004448927938938141, "learning_rate": 3.733888888888889e-06, "loss": 0.0002, "num_tokens": 986061.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 3280 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 60.75925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.07348132878541946, "kl": 0.015018029138445854, "learning_rate": 3.7333333333333337e-06, "loss": 0.0008, "num_tokens": 986361.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3281 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.5, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.5, "completions/mean_terminated_length": 6.5, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 60.77777777777778, "frac_reward_zero_std": 1.0, "grad_norm": 0.02002396248281002, "kl": 0.0013832662225468084, "learning_rate": 3.732777777777778e-06, "loss": 0.0001, "num_tokens": 986595.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 3282 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.25, "completions/clipped_ratio": 0.0, "completions/max_length": 28.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 21.25, "completions/mean_terminated_length": 21.25, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 60.7962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.02936232089996338, "kl": 0.03586737439036369, "learning_rate": 3.7322222222222224e-06, "loss": 0.0018, "num_tokens": 986932.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3283 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 60.81481481481482, "frac_reward_zero_std": 1.0, "grad_norm": 0.004118500743061304, "kl": 0.0008157312695402652, "learning_rate": 3.731666666666667e-06, "loss": 0.0, "num_tokens": 987192.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 3284 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 27.0, "completions/clipped_ratio": 0.0, "completions/max_length": 31.0, "completions/max_terminated_length": 31.0, "completions/mean_length": 27.0, "completions/mean_terminated_length": 27.0, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 60.833333333333336, "frac_reward_zero_std": 0.0, "grad_norm": 3.4936559200286865, "kl": 0.10406386852264404, "learning_rate": 3.7311111111111116e-06, "loss": -0.0014, "num_tokens": 987548.0, "reward": 2.5, "reward_std": 1.7320507764816284, "rewards/reward_combined/mean": 2.5, "rewards/reward_combined/std": 1.7320507764816284, "step": 3285 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.5, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 21.5, "completions/mean_terminated_length": 21.5, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 60.851851851851855, "frac_reward_zero_std": 1.0, "grad_norm": 0.431903213262558, "kl": 0.04787010047584772, "learning_rate": 3.7305555555555555e-06, "loss": 0.0025, "num_tokens": 987886.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3286 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.0, "completions/clipped_ratio": 0.0, "completions/max_length": 25.0, "completions/max_terminated_length": 25.0, "completions/mean_length": 24.0, "completions/mean_terminated_length": 24.0, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 60.870370370370374, "frac_reward_zero_std": 0.0, "grad_norm": 2.3120410442352295, "kl": 0.9300144985318184, "learning_rate": 3.7300000000000003e-06, "loss": 0.0437, "num_tokens": 988250.0, "reward": 2.25, "reward_std": 1.443375587463379, "rewards/reward_combined/mean": 2.25, "rewards/reward_combined/std": 1.4433757066726685, "step": 3287 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.75, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.75, "completions/mean_terminated_length": 14.75, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 60.888888888888886, "frac_reward_zero_std": 1.0, "grad_norm": 0.08580324798822403, "kl": 0.013487997930496931, "learning_rate": 3.729444444444445e-06, "loss": 0.0007, "num_tokens": 988541.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3288 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 60.907407407407405, "frac_reward_zero_std": 1.0, "grad_norm": 0.010733827017247677, "kl": 0.014830505475401878, "learning_rate": 3.728888888888889e-06, "loss": 0.0007, "num_tokens": 988853.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 3289 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.75, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 15.75, "completions/mean_terminated_length": 15.75, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 60.925925925925924, "frac_reward_zero_std": 1.0, "grad_norm": 0.030742855742573738, "kl": 0.0031587405828759074, "learning_rate": 3.7283333333333338e-06, "loss": 0.0002, "num_tokens": 989176.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3290 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 60.94444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.04996855929493904, "kl": 0.007175315520726144, "learning_rate": 3.727777777777778e-06, "loss": 0.0004, "num_tokens": 989474.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3291 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.013888888992369175, "clip_ratio/low_min": 0.013888888992369175, "clip_ratio/region_mean": 0.013888888992369175, "completion_length": 17.0, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 17.0, "completions/mean_terminated_length": 17.0, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 60.96296296296296, "frac_reward_zero_std": 0.0, "grad_norm": 3.2042598724365234, "kl": 0.1386697255074978, "learning_rate": 3.7272222222222225e-06, "loss": -0.0462, "num_tokens": 989770.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 3292 }, { "clip_ratio/high_max": 0.02777777798473835, "clip_ratio/high_mean": 0.02777777798473835, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.02777777798473835, "completion_length": 25.0, "completions/clipped_ratio": 0.0, "completions/max_length": 29.0, "completions/max_terminated_length": 29.0, "completions/mean_length": 25.0, "completions/mean_terminated_length": 25.0, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 60.98148148148148, "frac_reward_zero_std": 0.0, "grad_norm": 3.902221202850342, "kl": 0.17106035351753235, "learning_rate": 3.726666666666667e-06, "loss": 0.0259, "num_tokens": 990098.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 3293 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.25, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 18.25, "completions/mean_terminated_length": 18.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 61.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.019770652055740356, "kl": 0.002191289677284658, "learning_rate": 3.7261111111111116e-06, "loss": 0.0001, "num_tokens": 990411.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3294 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 43.75, "completions/clipped_ratio": 0.0, "completions/max_length": 89.0, "completions/max_terminated_length": 89.0, "completions/mean_length": 43.75, "completions/mean_terminated_length": 43.75, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 61.01851851851852, "frac_reward_zero_std": 0.0, "grad_norm": 1.7242045402526855, "kl": 0.22408606112003326, "learning_rate": 3.7255555555555556e-06, "loss": 0.2707, "num_tokens": 990802.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 3295 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 48.5, "completions/clipped_ratio": 0.0, "completions/max_length": 124.0, "completions/max_terminated_length": 124.0, "completions/mean_length": 48.5, "completions/mean_terminated_length": 48.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 61.03703703703704, "frac_reward_zero_std": 0.0, "grad_norm": 5.3951735496521, "kl": 0.021977580152451992, "learning_rate": 3.7250000000000003e-06, "loss": 0.4952, "num_tokens": 991216.0, "reward": 4.875, "reward_std": 3.1983067989349365, "rewards/reward_combined/mean": 4.875, "rewards/reward_combined/std": 3.1983067989349365, "step": 3296 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 61.05555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.06638296693563461, "kl": 0.005190499126911163, "learning_rate": 3.724444444444445e-06, "loss": 0.0003, "num_tokens": 991476.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 3297 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.25, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.25, "completions/mean_terminated_length": 11.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 61.074074074074076, "frac_reward_zero_std": 1.0, "grad_norm": 0.03596128150820732, "kl": 0.011840321123600006, "learning_rate": 3.723888888888889e-06, "loss": 0.0006, "num_tokens": 991749.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3298 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.25, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.25, "completions/mean_terminated_length": 14.25, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 61.092592592592595, "frac_reward_zero_std": 0.0, "grad_norm": 2.5089824199676514, "kl": 0.45400248374789953, "learning_rate": 3.723333333333334e-06, "loss": 0.0752, "num_tokens": 992038.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 3299 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 26.25, "completions/clipped_ratio": 0.0, "completions/max_length": 28.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 26.25, "completions/mean_terminated_length": 26.25, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 61.111111111111114, "frac_reward_zero_std": 0.0, "grad_norm": 1.0588181018829346, "kl": 0.12414070591330528, "learning_rate": 3.7227777777777778e-06, "loss": 0.0284, "num_tokens": 992379.0, "reward": 3.125, "reward_std": 3.1983067989349365, "rewards/reward_combined/mean": 3.125, "rewards/reward_combined/std": 3.1983067989349365, "step": 3300 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 61.129629629629626, "frac_reward_zero_std": 1.0, "grad_norm": 0.1455719769001007, "kl": 0.021848680218681693, "learning_rate": 3.7222222222222225e-06, "loss": 0.0011, "num_tokens": 992679.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3301 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 61.148148148148145, "frac_reward_zero_std": 1.0, "grad_norm": 0.011796163395047188, "kl": 0.2141786515712738, "learning_rate": 3.721666666666667e-06, "loss": 0.0107, "num_tokens": 992983.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3302 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.5, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 19.5, "completions/mean_terminated_length": 19.5, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 61.166666666666664, "frac_reward_zero_std": 1.0, "grad_norm": 0.056167930364608765, "kl": 0.013514367397874594, "learning_rate": 3.7211111111111112e-06, "loss": 0.0007, "num_tokens": 993289.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3303 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.5, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 8.5, "completions/mean_terminated_length": 8.5, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 61.18518518518518, "frac_reward_zero_std": 1.0, "grad_norm": 0.004132524598389864, "kl": 0.001027739024721086, "learning_rate": 3.7205555555555556e-06, "loss": 0.0, "num_tokens": 993543.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 3304 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 61.2037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.11384717375040054, "kl": 0.003448988078162074, "learning_rate": 3.7200000000000004e-06, "loss": 0.0002, "num_tokens": 993759.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 3305 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.75, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 4.75, "completions/mean_terminated_length": 4.75, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 61.22222222222222, "frac_reward_zero_std": 1.0, "grad_norm": 0.005105254705995321, "kl": 0.00046487152576446533, "learning_rate": 3.719444444444445e-06, "loss": 0.0, "num_tokens": 993978.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 3306 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.5, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 61.24074074074074, "frac_reward_zero_std": 0.0, "grad_norm": 2.8129680156707764, "kl": 0.12600144930183887, "learning_rate": 3.718888888888889e-06, "loss": 0.1086, "num_tokens": 994256.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 3307 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 61.25925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.02757744863629341, "kl": 0.001619843766093254, "learning_rate": 3.718333333333334e-06, "loss": 0.0001, "num_tokens": 994534.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3308 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.25, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.25, "completions/mean_terminated_length": 14.25, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 61.27777777777778, "frac_reward_zero_std": 1.0, "grad_norm": 0.020858149975538254, "kl": 0.0016996709746308625, "learning_rate": 3.717777777777778e-06, "loss": 0.0001, "num_tokens": 994851.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3309 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 61.2962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.005861029494553804, "kl": 0.00018483996245777234, "learning_rate": 3.7172222222222226e-06, "loss": 0.0, "num_tokens": 995107.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 3310 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 61.31481481481482, "frac_reward_zero_std": 1.0, "grad_norm": 0.04316964000463486, "kl": 0.010662832297384739, "learning_rate": 3.716666666666667e-06, "loss": 0.0005, "num_tokens": 995387.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3311 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 28.25, "completions/clipped_ratio": 0.0, "completions/max_length": 33.0, "completions/max_terminated_length": 33.0, "completions/mean_length": 28.25, "completions/mean_terminated_length": 28.25, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 61.333333333333336, "frac_reward_zero_std": 0.0, "grad_norm": 3.5575878620147705, "kl": 0.08993897214531898, "learning_rate": 3.7161111111111113e-06, "loss": 0.0414, "num_tokens": 995720.0, "reward": 2.5, "reward_std": 1.7320507764816284, "rewards/reward_combined/mean": 2.5, "rewards/reward_combined/std": 1.7320507764816284, "step": 3312 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.75, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 4.75, "completions/mean_terminated_length": 4.75, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 61.351851851851855, "frac_reward_zero_std": 1.0, "grad_norm": 0.08233688026666641, "kl": 0.03172195143997669, "learning_rate": 3.7155555555555557e-06, "loss": 0.0016, "num_tokens": 995939.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 3313 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.25, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.25, "completions/mean_terminated_length": 6.25, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 61.370370370370374, "frac_reward_zero_std": 1.0, "grad_norm": 0.05725571885704994, "kl": 0.0033767768181860447, "learning_rate": 3.7150000000000004e-06, "loss": 0.0002, "num_tokens": 996172.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 3314 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 36.25, "completions/clipped_ratio": 0.0, "completions/max_length": 37.0, "completions/max_terminated_length": 37.0, "completions/mean_length": 36.25, "completions/mean_terminated_length": 36.25, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "epoch": 61.388888888888886, "frac_reward_zero_std": 1.0, "grad_norm": 0.5723376870155334, "kl": 0.11518320441246033, "learning_rate": 3.7144444444444448e-06, "loss": 0.0058, "num_tokens": 996545.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3315 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 61.407407407407405, "frac_reward_zero_std": 1.0, "grad_norm": 0.0509815476834774, "kl": 0.04415077343583107, "learning_rate": 3.713888888888889e-06, "loss": 0.0023, "num_tokens": 996815.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3316 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.5, "completions/clipped_ratio": 0.0, "completions/max_length": 36.0, "completions/max_terminated_length": 36.0, "completions/mean_length": 25.5, "completions/mean_terminated_length": 25.5, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 61.425925925925924, "frac_reward_zero_std": 0.0, "grad_norm": 2.8790910243988037, "kl": 0.12316984869539738, "learning_rate": 3.713333333333334e-06, "loss": 0.0557, "num_tokens": 997165.0, "reward": 2.5, "reward_std": 1.7320507764816284, "rewards/reward_combined/mean": 2.5, "rewards/reward_combined/std": 1.7320507764816284, "step": 3317 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 18.0, "completions/mean_terminated_length": 18.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 61.44444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.062445707619190216, "kl": 0.0073662897339090705, "learning_rate": 3.712777777777778e-06, "loss": 0.0004, "num_tokens": 997477.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3318 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.5, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 12.5, "completions/mean_terminated_length": 12.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 61.46296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.01193488109856844, "kl": 0.0022399425506591797, "learning_rate": 3.7122222222222226e-06, "loss": 0.0001, "num_tokens": 997751.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3319 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 61.48148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.0001605776051292196, "kl": 3.626197576522827e-05, "learning_rate": 3.7116666666666666e-06, "loss": 0.0, "num_tokens": 997971.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 3320 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.5, "completions/clipped_ratio": 0.0, "completions/max_length": 25.0, "completions/max_terminated_length": 25.0, "completions/mean_length": 20.5, "completions/mean_terminated_length": 20.5, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 61.5, "frac_reward_zero_std": 0.0, "grad_norm": 2.8463218212127686, "kl": 0.07704522274434566, "learning_rate": 3.7111111111111113e-06, "loss": 0.0735, "num_tokens": 998305.0, "reward": 6.5, "reward_std": 2.0, "rewards/reward_combined/mean": 6.5, "rewards/reward_combined/std": 2.0, "step": 3321 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 61.51851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.009752177633345127, "kl": 0.01509946584701538, "learning_rate": 3.710555555555556e-06, "loss": 0.0008, "num_tokens": 998617.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 3322 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 61.53703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.021578026935458183, "kl": 0.000486813485622406, "learning_rate": 3.7100000000000005e-06, "loss": 0.0, "num_tokens": 998829.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 3323 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 61.55555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.03156450390815735, "kl": 0.010453439317643642, "learning_rate": 3.709444444444445e-06, "loss": 0.0006, "num_tokens": 999099.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3324 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.5, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 20.5, "completions/mean_terminated_length": 20.5, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 61.574074074074076, "frac_reward_zero_std": 0.0, "grad_norm": 3.7581279277801514, "kl": 0.0590220782905817, "learning_rate": 3.708888888888889e-06, "loss": -0.0061, "num_tokens": 999401.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 3325 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 61.592592592592595, "frac_reward_zero_std": 1.0, "grad_norm": 0.003659159643575549, "kl": 0.010605446994304657, "learning_rate": 3.708333333333334e-06, "loss": 0.0005, "num_tokens": 999637.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 3326 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 12.5, "completions/mean_terminated_length": 12.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 61.611111111111114, "frac_reward_zero_std": 1.0, "grad_norm": 0.20379482209682465, "kl": 0.036088400054723024, "learning_rate": 3.707777777777778e-06, "loss": 0.0018, "num_tokens": 999919.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3327 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 27.5, "completions/clipped_ratio": 0.0, "completions/max_length": 41.0, "completions/max_terminated_length": 41.0, "completions/mean_length": 27.5, "completions/mean_terminated_length": 27.5, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 61.629629629629626, "frac_reward_zero_std": 0.0, "grad_norm": 6.088356971740723, "kl": 0.10260043293237686, "learning_rate": 3.7072222222222227e-06, "loss": 0.1752, "num_tokens": 1000253.0, "reward": 6.0, "reward_std": 3.34165620803833, "rewards/reward_combined/mean": 6.0, "rewards/reward_combined/std": 3.34165620803833, "step": 3328 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 28.0, "completions/clipped_ratio": 0.0, "completions/max_length": 33.0, "completions/max_terminated_length": 33.0, "completions/mean_length": 28.0, "completions/mean_terminated_length": 28.0, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 61.648148148148145, "frac_reward_zero_std": 1.0, "grad_norm": 0.1074448972940445, "kl": 0.04104287549853325, "learning_rate": 3.7066666666666666e-06, "loss": 0.002, "num_tokens": 1000589.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3329 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 61.666666666666664, "frac_reward_zero_std": 1.0, "grad_norm": 0.012157675810158253, "kl": 0.015738967806100845, "learning_rate": 3.7061111111111114e-06, "loss": 0.0008, "num_tokens": 1000849.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3330 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.0, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 21.0, "completions/mean_terminated_length": 21.0, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 61.68518518518518, "frac_reward_zero_std": 1.0, "grad_norm": 0.03734297677874565, "kl": 0.026258178986608982, "learning_rate": 3.705555555555556e-06, "loss": 0.0013, "num_tokens": 1001181.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3331 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.75, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 22.75, "completions/mean_terminated_length": 22.75, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 61.7037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.6229716539382935, "kl": 0.13978800922632217, "learning_rate": 3.705e-06, "loss": 0.0069, "num_tokens": 1001500.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3332 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.5, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 17.5, "completions/mean_terminated_length": 17.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 61.72222222222222, "frac_reward_zero_std": 1.0, "grad_norm": 0.031670331954956055, "kl": 0.1651373952627182, "learning_rate": 3.704444444444445e-06, "loss": 0.0083, "num_tokens": 1001810.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3333 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 61.74074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.020541144534945488, "kl": 0.0028789745992980897, "learning_rate": 3.7038888888888892e-06, "loss": 0.0002, "num_tokens": 1002100.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3334 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 10.5, "completions/mean_terminated_length": 10.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 61.75925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.023621194064617157, "kl": 0.0011950001353397965, "learning_rate": 3.7033333333333336e-06, "loss": 0.0001, "num_tokens": 1002362.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3335 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 61.77777777777778, "frac_reward_zero_std": 1.0, "grad_norm": 0.008141793310642242, "kl": 0.00040233382605947554, "learning_rate": 3.702777777777778e-06, "loss": 0.0, "num_tokens": 1002634.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3336 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.25, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 9.25, "completions/mean_terminated_length": 9.25, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 61.7962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.24025453627109528, "kl": 0.026112884283065796, "learning_rate": 3.7022222222222227e-06, "loss": 0.0016, "num_tokens": 1002883.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 3337 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 43.25, "completions/clipped_ratio": 0.0, "completions/max_length": 44.0, "completions/max_terminated_length": 44.0, "completions/mean_length": 43.25, "completions/mean_terminated_length": 43.25, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 61.81481481481482, "frac_reward_zero_std": 1.0, "grad_norm": 0.041742075234651566, "kl": 0.05328580550849438, "learning_rate": 3.7016666666666667e-06, "loss": 0.0027, "num_tokens": 1003336.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 3338 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.0, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 24.0, "completions/mean_terminated_length": 24.0, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 61.833333333333336, "frac_reward_zero_std": 0.0, "grad_norm": 0.05111464485526085, "kl": 0.36191771551966667, "learning_rate": 3.7011111111111114e-06, "loss": 0.0182, "num_tokens": 1003700.0, "reward": 2.875, "reward_std": 1.25, "rewards/reward_combined/mean": 2.875, "rewards/reward_combined/std": 1.25, "step": 3339 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.5, "completions/clipped_ratio": 0.0, "completions/max_length": 28.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 24.5, "completions/mean_terminated_length": 24.5, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 61.851851851851855, "frac_reward_zero_std": 0.0, "grad_norm": 4.487477779388428, "kl": 0.1925869956612587, "learning_rate": 3.7005555555555562e-06, "loss": 0.0628, "num_tokens": 1004026.0, "reward": 4.375, "reward_std": 4.75, "rewards/reward_combined/mean": 4.375, "rewards/reward_combined/std": 4.75, "step": 3340 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 78.25, "completions/clipped_ratio": 0.0, "completions/max_length": 189.0, "completions/max_terminated_length": 189.0, "completions/mean_length": 78.25, "completions/mean_terminated_length": 78.25, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 61.870370370370374, "frac_reward_zero_std": 0.0, "grad_norm": 1.657657265663147, "kl": 0.15048764646053314, "learning_rate": 3.7e-06, "loss": 0.0878, "num_tokens": 1004571.0, "reward": 6.925000190734863, "reward_std": 1.3500001430511475, "rewards/reward_combined/mean": 6.925000190734863, "rewards/reward_combined/std": 1.3500001430511475, "step": 3341 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 61.888888888888886, "frac_reward_zero_std": 1.0, "grad_norm": 0.003344438038766384, "kl": 0.0030989539809525013, "learning_rate": 3.699444444444445e-06, "loss": 0.0002, "num_tokens": 1004867.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3342 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.0, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 20.0, "completions/mean_terminated_length": 20.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 61.907407407407405, "frac_reward_zero_std": 1.0, "grad_norm": 0.060032933950424194, "kl": 0.029442924074828625, "learning_rate": 3.6988888888888893e-06, "loss": 0.0014, "num_tokens": 1005199.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3343 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.0, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 22.0, "completions/mean_terminated_length": 22.0, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 61.925925925925924, "frac_reward_zero_std": 1.0, "grad_norm": 0.024470604956150055, "kl": 0.006174231180921197, "learning_rate": 3.6983333333333336e-06, "loss": 0.0003, "num_tokens": 1005531.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3344 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 37.75, "completions/clipped_ratio": 0.0, "completions/max_length": 48.0, "completions/max_terminated_length": 48.0, "completions/mean_length": 37.75, "completions/mean_terminated_length": 37.75, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 61.94444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.051238469779491425, "kl": 0.018045068252831697, "learning_rate": 3.697777777777778e-06, "loss": 0.0009, "num_tokens": 1005902.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3345 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 61.96296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.044744864106178284, "kl": 0.009374113287776709, "learning_rate": 3.6972222222222228e-06, "loss": 0.0005, "num_tokens": 1006170.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3346 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 61.98148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 1.7680118083953857, "kl": 0.11680825427174568, "learning_rate": 3.6966666666666667e-06, "loss": 0.007, "num_tokens": 1006436.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3347 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.5, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 2.5, "completions/mean_terminated_length": 2.5, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 62.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.04189912974834442, "kl": 0.0038198083639144897, "learning_rate": 3.6961111111111115e-06, "loss": 0.0002, "num_tokens": 1006642.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 3348 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 62.01851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.044485822319984436, "kl": 0.009181688539683819, "learning_rate": 3.6955555555555563e-06, "loss": 0.0005, "num_tokens": 1006914.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3349 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.0, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 20.0, "completions/mean_terminated_length": 20.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 62.03703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.033510781824588776, "kl": 0.02806740114465356, "learning_rate": 3.695e-06, "loss": 0.0013, "num_tokens": 1007246.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3350 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 62.05555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.006065318360924721, "kl": 0.00029621273279190063, "learning_rate": 3.694444444444445e-06, "loss": 0.0, "num_tokens": 1007458.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 3351 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 62.074074074074076, "frac_reward_zero_std": 1.0, "grad_norm": 0.05755128711462021, "kl": 0.002325886453036219, "learning_rate": 3.693888888888889e-06, "loss": 0.0001, "num_tokens": 1007726.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3352 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 62.092592592592595, "frac_reward_zero_std": 1.0, "grad_norm": 0.05006115138530731, "kl": 0.03155629336833954, "learning_rate": 3.6933333333333337e-06, "loss": 0.0016, "num_tokens": 1007942.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 3353 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.75, "completions/clipped_ratio": 0.0, "completions/max_length": 25.0, "completions/max_terminated_length": 25.0, "completions/mean_length": 20.75, "completions/mean_terminated_length": 20.75, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 62.111111111111114, "frac_reward_zero_std": 1.0, "grad_norm": 0.2417583018541336, "kl": 0.04533923137933016, "learning_rate": 3.692777777777778e-06, "loss": 0.0023, "num_tokens": 1008245.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3354 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.75, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 4.75, "completions/mean_terminated_length": 4.75, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 62.129629629629626, "frac_reward_zero_std": 1.0, "grad_norm": 0.038089267909526825, "kl": 0.0017151072970591486, "learning_rate": 3.6922222222222224e-06, "loss": 0.0001, "num_tokens": 1008464.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 3355 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.75, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 19.75, "completions/mean_terminated_length": 19.75, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 62.148148148148145, "frac_reward_zero_std": 1.0, "grad_norm": 0.32644084095954895, "kl": 0.17947406321763992, "learning_rate": 3.6916666666666668e-06, "loss": 0.0085, "num_tokens": 1008763.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 3356 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 62.166666666666664, "frac_reward_zero_std": 0.0, "grad_norm": 4.244999885559082, "kl": 0.024954683845862746, "learning_rate": 3.6911111111111115e-06, "loss": 0.0991, "num_tokens": 1009053.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 3357 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.0, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 19.0, "completions/mean_terminated_length": 19.0, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 62.18518518518518, "frac_reward_zero_std": 1.0, "grad_norm": 0.0380619578063488, "kl": 0.04241488128900528, "learning_rate": 3.6905555555555563e-06, "loss": 0.0021, "num_tokens": 1009381.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3358 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 62.2037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.007090247236192226, "kl": 0.0012280881055630744, "learning_rate": 3.6900000000000002e-06, "loss": 0.0001, "num_tokens": 1009641.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 3359 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 62.22222222222222, "frac_reward_zero_std": 1.0, "grad_norm": 0.05415713042020798, "kl": 0.005235703662037849, "learning_rate": 3.689444444444445e-06, "loss": 0.0003, "num_tokens": 1009937.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3360 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.75, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 18.75, "completions/mean_terminated_length": 18.75, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 62.24074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.0673973485827446, "kl": 0.014206718653440475, "learning_rate": 3.688888888888889e-06, "loss": 0.0007, "num_tokens": 1010240.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3361 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.5, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 20.5, "completions/mean_terminated_length": 20.5, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 62.25925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.12474815547466278, "kl": 0.03507429268211126, "learning_rate": 3.6883333333333337e-06, "loss": 0.0017, "num_tokens": 1010570.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3362 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 62.27777777777778, "frac_reward_zero_std": 1.0, "grad_norm": 0.010035203769803047, "kl": 0.002931106835603714, "learning_rate": 3.687777777777778e-06, "loss": 0.0001, "num_tokens": 1010830.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 3363 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.5, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 18.5, "completions/mean_terminated_length": 18.5, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 62.2962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.0379633791744709, "kl": 0.0033033699728548527, "learning_rate": 3.6872222222222224e-06, "loss": 0.0002, "num_tokens": 1011144.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3364 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 43.0, "completions/clipped_ratio": 0.0, "completions/max_length": 52.0, "completions/max_terminated_length": 52.0, "completions/mean_length": 43.0, "completions/mean_terminated_length": 43.0, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 62.31481481481482, "frac_reward_zero_std": 0.0, "grad_norm": 1.3307594060897827, "kl": 0.031008930876851082, "learning_rate": 3.686666666666667e-06, "loss": -0.0037, "num_tokens": 1011536.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 3365 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 62.333333333333336, "frac_reward_zero_std": 1.0, "grad_norm": 0.04222875088453293, "kl": 0.028961047530174255, "learning_rate": 3.6861111111111116e-06, "loss": 0.0014, "num_tokens": 1011820.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3366 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 62.351851851851855, "frac_reward_zero_std": 1.0, "grad_norm": 0.012905299663543701, "kl": 0.0015537950675934553, "learning_rate": 3.685555555555556e-06, "loss": 0.0001, "num_tokens": 1012108.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3367 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 12.5, "completions/mean_terminated_length": 12.5, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 62.370370370370374, "frac_reward_zero_std": 0.0, "grad_norm": 2.1337125301361084, "kl": 0.046071136835962534, "learning_rate": 3.6850000000000003e-06, "loss": 0.0391, "num_tokens": 1012382.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 3368 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 12.5, "completions/mean_terminated_length": 12.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 62.388888888888886, "frac_reward_zero_std": 1.0, "grad_norm": 2.0079236030578613, "kl": 0.2971185212954879, "learning_rate": 3.684444444444445e-06, "loss": 0.0149, "num_tokens": 1012684.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3369 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 62.407407407407405, "frac_reward_zero_std": 1.0, "grad_norm": 0.03355126082897186, "kl": 0.0302305705845356, "learning_rate": 3.683888888888889e-06, "loss": 0.0015, "num_tokens": 1012952.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3370 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 62.425925925925924, "frac_reward_zero_std": 1.0, "grad_norm": 0.025436008349061012, "kl": 0.013157148379832506, "learning_rate": 3.6833333333333338e-06, "loss": 0.0007, "num_tokens": 1013232.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3371 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 62.44444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.011237611062824726, "kl": 0.01593383215367794, "learning_rate": 3.6827777777777777e-06, "loss": 0.0008, "num_tokens": 1013492.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3372 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 12.5, "completions/mean_terminated_length": 12.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 62.46296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.19951048493385315, "kl": 0.029844471719115973, "learning_rate": 3.6822222222222225e-06, "loss": 0.0014, "num_tokens": 1013766.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3373 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.75, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 18.75, "completions/mean_terminated_length": 18.75, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 62.48148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.07336229085922241, "kl": 0.024773717625066638, "learning_rate": 3.681666666666667e-06, "loss": 0.001, "num_tokens": 1014085.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3374 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.75, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 22.75, "completions/mean_terminated_length": 22.75, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 62.5, "frac_reward_zero_std": 1.0, "grad_norm": 0.05782890319824219, "kl": 0.013115547131747007, "learning_rate": 3.681111111111111e-06, "loss": 0.0007, "num_tokens": 1014404.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3375 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.75, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 16.75, "completions/mean_terminated_length": 16.75, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 62.51851851851852, "frac_reward_zero_std": 0.0, "grad_norm": 3.9083445072174072, "kl": 0.14098144322633743, "learning_rate": 3.680555555555556e-06, "loss": -0.024, "num_tokens": 1014711.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 3376 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 48.5, "completions/clipped_ratio": 0.0, "completions/max_length": 64.0, "completions/max_terminated_length": 64.0, "completions/mean_length": 48.5, "completions/mean_terminated_length": 48.5, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "epoch": 62.53703703703704, "frac_reward_zero_std": 0.0, "grad_norm": 1.6620241403579712, "kl": 0.04606764763593674, "learning_rate": 3.6800000000000003e-06, "loss": 0.0883, "num_tokens": 1015185.0, "reward": 2.375, "reward_std": 1.25, "rewards/reward_combined/mean": 2.375, "rewards/reward_combined/std": 1.25, "step": 3377 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 26.75, "completions/clipped_ratio": 0.0, "completions/max_length": 30.0, "completions/max_terminated_length": 30.0, "completions/mean_length": 26.75, "completions/mean_terminated_length": 26.75, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 62.55555555555556, "frac_reward_zero_std": 0.0, "grad_norm": 3.9765515327453613, "kl": 0.04677324369549751, "learning_rate": 3.679444444444445e-06, "loss": 0.0311, "num_tokens": 1015540.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 3378 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 62.574074074074076, "frac_reward_zero_std": 1.0, "grad_norm": 0.028933420777320862, "kl": 0.23955227434635162, "learning_rate": 3.678888888888889e-06, "loss": 0.0119, "num_tokens": 1015840.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3379 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 37.25, "completions/clipped_ratio": 0.0, "completions/max_length": 75.0, "completions/max_terminated_length": 75.0, "completions/mean_length": 37.25, "completions/mean_terminated_length": 37.25, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 62.592592592592595, "frac_reward_zero_std": 0.0, "grad_norm": 2.7819201946258545, "kl": 0.09132722392678261, "learning_rate": 3.678333333333334e-06, "loss": -0.2529, "num_tokens": 1016225.0, "reward": 7.625, "reward_std": 0.25, "rewards/reward_combined/mean": 7.625, "rewards/reward_combined/std": 0.25, "step": 3380 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 27.75, "completions/clipped_ratio": 0.0, "completions/max_length": 39.0, "completions/max_terminated_length": 39.0, "completions/mean_length": 27.75, "completions/mean_terminated_length": 27.75, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 62.611111111111114, "frac_reward_zero_std": 0.0, "grad_norm": 2.832656145095825, "kl": 0.15064430236816406, "learning_rate": 3.6777777777777778e-06, "loss": -0.1081, "num_tokens": 1016604.0, "reward": 4.625, "reward_std": 2.25, "rewards/reward_combined/mean": 4.625, "rewards/reward_combined/std": 2.25, "step": 3381 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.5, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.5, "completions/mean_terminated_length": 6.5, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 62.629629629629626, "frac_reward_zero_std": 1.0, "grad_norm": 0.03550001606345177, "kl": 0.001497817866038531, "learning_rate": 3.6772222222222225e-06, "loss": 0.0001, "num_tokens": 1016838.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 3382 }, { "clip_ratio/high_max": 0.0833333358168602, "clip_ratio/high_mean": 0.0833333358168602, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0833333358168602, "completion_length": 4.25, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 4.25, "completions/mean_terminated_length": 4.25, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 62.648148148148145, "frac_reward_zero_std": 0.0, "grad_norm": 14.809858322143555, "kl": 0.18098166584968567, "learning_rate": 3.676666666666667e-06, "loss": 0.1437, "num_tokens": 1017051.0, "reward": 2.75, "reward_std": 1.5, "rewards/reward_combined/mean": 2.75, "rewards/reward_combined/std": 1.5, "step": 3383 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 29.25, "completions/clipped_ratio": 0.0, "completions/max_length": 38.0, "completions/max_terminated_length": 38.0, "completions/mean_length": 29.25, "completions/mean_terminated_length": 29.25, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 62.666666666666664, "frac_reward_zero_std": 1.0, "grad_norm": 0.14115338027477264, "kl": 0.04560870770365, "learning_rate": 3.6761111111111113e-06, "loss": 0.0025, "num_tokens": 1017392.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3384 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 62.68518518518518, "frac_reward_zero_std": 1.0, "grad_norm": 0.00457087391987443, "kl": 0.00020851791487075388, "learning_rate": 3.675555555555556e-06, "loss": 0.0, "num_tokens": 1017648.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 3385 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.25, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 15.25, "completions/mean_terminated_length": 15.25, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 62.7037037037037, "frac_reward_zero_std": 0.0, "grad_norm": 5.422107219696045, "kl": 0.2008829414844513, "learning_rate": 3.6750000000000004e-06, "loss": -0.0022, "num_tokens": 1017925.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 3386 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.25, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 16.25, "completions/mean_terminated_length": 16.25, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 62.72222222222222, "frac_reward_zero_std": 0.0, "grad_norm": 0.9819877743721008, "kl": 0.02000294253230095, "learning_rate": 3.6744444444444447e-06, "loss": 0.0153, "num_tokens": 1018238.0, "reward": 2.75, "reward_std": 1.5, "rewards/reward_combined/mean": 2.75, "rewards/reward_combined/std": 1.5, "step": 3387 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 62.74074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.000153022599988617, "kl": 4.08366322517395e-05, "learning_rate": 3.673888888888889e-06, "loss": 0.0, "num_tokens": 1018458.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 3388 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.25, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 9.25, "completions/mean_terminated_length": 9.25, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 62.75925925925926, "frac_reward_zero_std": 0.0, "grad_norm": 3.997636556625366, "kl": 0.030708379112184048, "learning_rate": 3.673333333333334e-06, "loss": -0.0284, "num_tokens": 1018715.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 3389 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 34.25, "completions/clipped_ratio": 0.0, "completions/max_length": 40.0, "completions/max_terminated_length": 40.0, "completions/mean_length": 34.25, "completions/mean_terminated_length": 34.25, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 62.77777777777778, "frac_reward_zero_std": 0.0, "grad_norm": 2.444230794906616, "kl": 0.11033226177096367, "learning_rate": 3.672777777777778e-06, "loss": -0.0346, "num_tokens": 1019080.0, "reward": 6.75, "reward_std": 2.1794495582580566, "rewards/reward_combined/mean": 6.75, "rewards/reward_combined/std": 2.1794495582580566, "step": 3390 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 62.7962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.005627889651805162, "kl": 0.0011415411718189716, "learning_rate": 3.6722222222222226e-06, "loss": 0.0001, "num_tokens": 1019400.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3391 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.25, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 13.25, "completions/mean_terminated_length": 13.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 62.81481481481482, "frac_reward_zero_std": 1.0, "grad_norm": 0.16572798788547516, "kl": 0.01191854802891612, "learning_rate": 3.6716666666666665e-06, "loss": 0.0006, "num_tokens": 1019665.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3392 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.75, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 21.75, "completions/mean_terminated_length": 21.75, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 62.833333333333336, "frac_reward_zero_std": 0.0, "grad_norm": 2.830596446990967, "kl": 0.04812158830463886, "learning_rate": 3.6711111111111113e-06, "loss": -0.007, "num_tokens": 1019976.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 3393 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.75, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 11.75, "completions/mean_terminated_length": 11.75, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 62.851851851851855, "frac_reward_zero_std": 1.0, "grad_norm": 0.22114554047584534, "kl": 0.02216158900409937, "learning_rate": 3.670555555555556e-06, "loss": 0.0012, "num_tokens": 1020243.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3394 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.75, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.75, "completions/mean_terminated_length": 14.75, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 62.870370370370374, "frac_reward_zero_std": 1.0, "grad_norm": 0.20952539145946503, "kl": 0.05244244821369648, "learning_rate": 3.6700000000000004e-06, "loss": 0.0026, "num_tokens": 1020534.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3395 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 29.0, "completions/clipped_ratio": 0.0, "completions/max_length": 31.0, "completions/max_terminated_length": 31.0, "completions/mean_length": 29.0, "completions/mean_terminated_length": 29.0, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 62.888888888888886, "frac_reward_zero_std": 0.0, "grad_norm": 3.3135416507720947, "kl": 0.2092377245426178, "learning_rate": 3.669444444444445e-06, "loss": 0.0249, "num_tokens": 1020866.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 3396 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 62.907407407407405, "frac_reward_zero_std": 1.0, "grad_norm": 0.01576932892203331, "kl": 0.0017602331936359406, "learning_rate": 3.668888888888889e-06, "loss": 0.0001, "num_tokens": 1021110.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 3397 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 62.925925925925924, "frac_reward_zero_std": 1.0, "grad_norm": 0.02949345111846924, "kl": 0.003024222096428275, "learning_rate": 3.668333333333334e-06, "loss": 0.0002, "num_tokens": 1021392.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3398 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 62.94444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.007548513822257519, "kl": 0.0016490072011947632, "learning_rate": 3.667777777777778e-06, "loss": 0.0001, "num_tokens": 1021604.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 3399 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 62.96296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.0033867305610328913, "kl": 0.01063515990972519, "learning_rate": 3.6672222222222226e-06, "loss": 0.0005, "num_tokens": 1021840.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 3400 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.5, "completions/clipped_ratio": 0.0, "completions/max_length": 33.0, "completions/max_terminated_length": 33.0, "completions/mean_length": 23.5, "completions/mean_terminated_length": 23.5, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 62.98148148148148, "frac_reward_zero_std": 0.0, "grad_norm": 4.232091903686523, "kl": 0.13629882968962193, "learning_rate": 3.6666666666666666e-06, "loss": -0.0103, "num_tokens": 1022162.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 3401 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 63.0, "frac_reward_zero_std": 0.0, "grad_norm": 3.8908512592315674, "kl": 0.02209263585973531, "learning_rate": 3.6661111111111114e-06, "loss": -0.0317, "num_tokens": 1022442.0, "reward": 7.625, "reward_std": 0.25, "rewards/reward_combined/mean": 7.625, "rewards/reward_combined/std": 0.25, "step": 3402 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 26.25, "completions/clipped_ratio": 0.0, "completions/max_length": 51.0, "completions/max_terminated_length": 51.0, "completions/mean_length": 26.25, "completions/mean_terminated_length": 26.25, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 63.01851851851852, "frac_reward_zero_std": 0.0, "grad_norm": 4.29252815246582, "kl": 0.15114304423332214, "learning_rate": 3.665555555555556e-06, "loss": 0.2264, "num_tokens": 1022799.0, "reward": 5.925000190734863, "reward_std": 4.150000095367432, "rewards/reward_combined/mean": 5.925000190734863, "rewards/reward_combined/std": 4.150000095367432, "step": 3403 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.75, "completions/clipped_ratio": 0.0, "completions/max_length": 27.0, "completions/max_terminated_length": 27.0, "completions/mean_length": 9.75, "completions/mean_terminated_length": 9.75, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 63.03703703703704, "frac_reward_zero_std": 0.0, "grad_norm": 8.025931358337402, "kl": 0.09706158936023712, "learning_rate": 3.665e-06, "loss": 0.3715, "num_tokens": 1023058.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 3404 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 63.05555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.0019693896174430847, "kl": 0.00014501064288197085, "learning_rate": 3.664444444444445e-06, "loss": 0.0, "num_tokens": 1023330.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3405 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.75, "completions/clipped_ratio": 0.0, "completions/max_length": 31.0, "completions/max_terminated_length": 31.0, "completions/mean_length": 23.75, "completions/mean_terminated_length": 23.75, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 63.074074074074076, "frac_reward_zero_std": 0.0, "grad_norm": 2.023855209350586, "kl": 0.13454778864979744, "learning_rate": 3.663888888888889e-06, "loss": -0.0315, "num_tokens": 1023653.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 3406 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 63.092592592592595, "frac_reward_zero_std": 1.0, "grad_norm": 0.0255223847925663, "kl": 0.0034106895327568054, "learning_rate": 3.6633333333333336e-06, "loss": 0.0002, "num_tokens": 1023913.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 3407 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 28.75, "completions/clipped_ratio": 0.0, "completions/max_length": 30.0, "completions/max_terminated_length": 30.0, "completions/mean_length": 28.75, "completions/mean_terminated_length": 28.75, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 63.111111111111114, "frac_reward_zero_std": 1.0, "grad_norm": 2.4144246578216553, "kl": 0.5016429796814919, "learning_rate": 3.662777777777778e-06, "loss": 0.0254, "num_tokens": 1024244.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3408 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.0, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 19.0, "completions/mean_terminated_length": 19.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 63.129629629629626, "frac_reward_zero_std": 0.0, "grad_norm": 3.3299496173858643, "kl": 0.024085860466584563, "learning_rate": 3.6622222222222227e-06, "loss": -0.0879, "num_tokens": 1024564.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 3409 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 63.148148148148145, "frac_reward_zero_std": 1.0, "grad_norm": 0.00015039920981507748, "kl": 4.296749830245972e-05, "learning_rate": 3.6616666666666666e-06, "loss": 0.0, "num_tokens": 1024784.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 3410 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 63.166666666666664, "frac_reward_zero_std": 1.0, "grad_norm": 0.04222999885678291, "kl": 0.01892260182648897, "learning_rate": 3.6611111111111114e-06, "loss": 0.001, "num_tokens": 1025078.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3411 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.25, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 20.25, "completions/mean_terminated_length": 20.25, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 63.18518518518518, "frac_reward_zero_std": 1.0, "grad_norm": 0.1169326975941658, "kl": 0.12420575320720673, "learning_rate": 3.660555555555556e-06, "loss": 0.0062, "num_tokens": 1025379.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 3412 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.010416666977107525, "clip_ratio/low_min": 0.010416666977107525, "clip_ratio/region_mean": 0.010416666977107525, "completion_length": 23.5, "completions/clipped_ratio": 0.0, "completions/max_length": 25.0, "completions/max_terminated_length": 25.0, "completions/mean_length": 23.5, "completions/mean_terminated_length": 23.5, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 63.2037037037037, "frac_reward_zero_std": 0.0, "grad_norm": 5.141938209533691, "kl": 0.04829752817749977, "learning_rate": 3.66e-06, "loss": 0.0574, "num_tokens": 1025721.0, "reward": 2.5, "reward_std": 1.7320507764816284, "rewards/reward_combined/mean": 2.5, "rewards/reward_combined/std": 1.7320507764816284, "step": 3413 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.0, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 19.0, "completions/mean_terminated_length": 19.0, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 63.22222222222222, "frac_reward_zero_std": 1.0, "grad_norm": 0.03264474496245384, "kl": 0.03928976133465767, "learning_rate": 3.659444444444445e-06, "loss": 0.002, "num_tokens": 1026049.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3414 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 29.75, "completions/clipped_ratio": 0.0, "completions/max_length": 40.0, "completions/max_terminated_length": 40.0, "completions/mean_length": 29.75, "completions/mean_terminated_length": 29.75, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 63.24074074074074, "frac_reward_zero_std": 0.0, "grad_norm": 1.6268746852874756, "kl": 0.08984493091702461, "learning_rate": 3.6588888888888892e-06, "loss": 0.0339, "num_tokens": 1026404.0, "reward": 4.5, "reward_std": 2.0, "rewards/reward_combined/mean": 4.5, "rewards/reward_combined/std": 2.0, "step": 3415 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.25, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 9.25, "completions/mean_terminated_length": 9.25, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 63.25925925925926, "frac_reward_zero_std": 0.0, "grad_norm": 11.364824295043945, "kl": 0.1474447138607502, "learning_rate": 3.6583333333333336e-06, "loss": 0.1424, "num_tokens": 1026653.0, "reward": 3.875, "reward_std": 0.25, "rewards/reward_combined/mean": 3.875, "rewards/reward_combined/std": 0.25, "step": 3416 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.5, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 18.5, "completions/mean_terminated_length": 18.5, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 63.27777777777778, "frac_reward_zero_std": 1.0, "grad_norm": 0.008993592113256454, "kl": 0.0008268636011052877, "learning_rate": 3.657777777777778e-06, "loss": 0.0, "num_tokens": 1026967.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3417 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.5, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 4.5, "completions/mean_terminated_length": 4.5, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 63.2962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.00995576661080122, "kl": 0.0014955937513150275, "learning_rate": 3.6572222222222227e-06, "loss": 0.0001, "num_tokens": 1027185.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 3418 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.5, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 3.5, "completions/mean_terminated_length": 3.5, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 63.31481481481482, "frac_reward_zero_std": 1.0, "grad_norm": 0.29502251744270325, "kl": 0.05646159127354622, "learning_rate": 3.6566666666666667e-06, "loss": 0.0033, "num_tokens": 1027395.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 3419 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.25, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 20.25, "completions/mean_terminated_length": 20.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 63.333333333333336, "frac_reward_zero_std": 1.0, "grad_norm": 0.08066502213478088, "kl": 0.027980757877230644, "learning_rate": 3.6561111111111114e-06, "loss": 0.0014, "num_tokens": 1027724.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3420 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 63.351851851851855, "frac_reward_zero_std": 1.0, "grad_norm": 0.037554964423179626, "kl": 0.01654118113219738, "learning_rate": 3.6555555555555562e-06, "loss": 0.0008, "num_tokens": 1027998.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3421 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.5, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 8.5, "completions/mean_terminated_length": 8.5, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 63.370370370370374, "frac_reward_zero_std": 1.0, "grad_norm": 0.005308639723807573, "kl": 0.0014436364290304482, "learning_rate": 3.655e-06, "loss": 0.0001, "num_tokens": 1028252.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 3422 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 63.388888888888886, "frac_reward_zero_std": 1.0, "grad_norm": 0.012933995574712753, "kl": 0.21441413462162018, "learning_rate": 3.654444444444445e-06, "loss": 0.0107, "num_tokens": 1028556.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3423 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 63.407407407407405, "frac_reward_zero_std": 1.0, "grad_norm": 0.004752307198941708, "kl": 0.00045444071292877197, "learning_rate": 3.653888888888889e-06, "loss": 0.0, "num_tokens": 1028768.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 3424 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.0, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 17.0, "completions/mean_terminated_length": 17.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 63.425925925925924, "frac_reward_zero_std": 1.0, "grad_norm": 0.06267320364713669, "kl": 0.012393136508762836, "learning_rate": 3.6533333333333336e-06, "loss": 0.0006, "num_tokens": 1029064.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3425 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.0, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 24.0, "completions/mean_terminated_length": 24.0, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 63.44444444444444, "frac_reward_zero_std": 0.0, "grad_norm": 0.8660312294960022, "kl": 0.14519864693284035, "learning_rate": 3.652777777777778e-06, "loss": 0.0072, "num_tokens": 1029428.0, "reward": 2.875, "reward_std": 1.25, "rewards/reward_combined/mean": 2.875, "rewards/reward_combined/std": 1.25, "step": 3426 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.75, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 13.75, "completions/mean_terminated_length": 13.75, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 63.46296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.02328413724899292, "kl": 0.002630763512570411, "learning_rate": 3.6522222222222224e-06, "loss": 0.0001, "num_tokens": 1029707.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3427 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 63.48148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.030178679153323174, "kl": 0.02811182290315628, "learning_rate": 3.6516666666666667e-06, "loss": 0.0014, "num_tokens": 1029923.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 3428 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 29.0, "completions/clipped_ratio": 0.0, "completions/max_length": 33.0, "completions/max_terminated_length": 33.0, "completions/mean_length": 29.0, "completions/mean_terminated_length": 29.0, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 63.5, "frac_reward_zero_std": 0.0, "grad_norm": 3.2171874046325684, "kl": 0.06329800002276897, "learning_rate": 3.6511111111111115e-06, "loss": 0.001, "num_tokens": 1030259.0, "reward": 4.375, "reward_std": 3.902456521987915, "rewards/reward_combined/mean": 4.375, "rewards/reward_combined/std": 3.902456521987915, "step": 3429 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.75, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.75, "completions/mean_terminated_length": 14.75, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 63.51851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.004545693751424551, "kl": 0.0010372151737101376, "learning_rate": 3.6505555555555563e-06, "loss": 0.0001, "num_tokens": 1030578.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3430 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 63.53703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.011416622437536716, "kl": 0.0010245293378829956, "learning_rate": 3.65e-06, "loss": 0.0001, "num_tokens": 1030790.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 3431 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 7.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 7.0, "completions/mean_terminated_length": 7.0, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 63.55555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.025983966886997223, "kl": 0.002172470063669607, "learning_rate": 3.649444444444445e-06, "loss": 0.0001, "num_tokens": 1031026.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 3432 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 44.25, "completions/clipped_ratio": 0.0, "completions/max_length": 51.0, "completions/max_terminated_length": 51.0, "completions/mean_length": 44.25, "completions/mean_terminated_length": 44.25, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 63.574074074074076, "frac_reward_zero_std": 1.0, "grad_norm": 0.05055323615670204, "kl": 0.043458595871925354, "learning_rate": 3.648888888888889e-06, "loss": 0.0022, "num_tokens": 1031483.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 3433 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 63.592592592592595, "frac_reward_zero_std": 1.0, "grad_norm": 0.010059303604066372, "kl": 0.01501404121518135, "learning_rate": 3.6483333333333337e-06, "loss": 0.0008, "num_tokens": 1031795.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 3434 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 63.611111111111114, "frac_reward_zero_std": 1.0, "grad_norm": 0.029378216713666916, "kl": 0.001078117056749761, "learning_rate": 3.647777777777778e-06, "loss": 0.0001, "num_tokens": 1032067.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3435 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 34.0, "completions/clipped_ratio": 0.0, "completions/max_length": 48.0, "completions/max_terminated_length": 48.0, "completions/mean_length": 34.0, "completions/mean_terminated_length": 34.0, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 63.629629629629626, "frac_reward_zero_std": 1.0, "grad_norm": 0.02704840898513794, "kl": 0.021503036841750145, "learning_rate": 3.6472222222222224e-06, "loss": 0.001, "num_tokens": 1032423.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3436 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 63.648148148148145, "frac_reward_zero_std": 1.0, "grad_norm": 0.005896145943552256, "kl": 0.00017545169248478487, "learning_rate": 3.6466666666666668e-06, "loss": 0.0, "num_tokens": 1032719.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3437 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 63.666666666666664, "frac_reward_zero_std": 1.0, "grad_norm": 0.009935243986546993, "kl": 0.016208473592996597, "learning_rate": 3.6461111111111115e-06, "loss": 0.0008, "num_tokens": 1032979.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3438 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 30.0, "completions/max_terminated_length": 30.0, "completions/mean_length": 18.0, "completions/mean_terminated_length": 18.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 63.68518518518518, "frac_reward_zero_std": 0.0, "grad_norm": 2.938436269760132, "kl": 0.025851076235994697, "learning_rate": 3.645555555555556e-06, "loss": 0.1794, "num_tokens": 1033283.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 3439 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.25, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 11.25, "completions/mean_terminated_length": 11.25, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 63.7037037037037, "frac_reward_zero_std": 0.0, "grad_norm": 7.184455871582031, "kl": 0.06319915316998959, "learning_rate": 3.6450000000000003e-06, "loss": 0.0265, "num_tokens": 1033548.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 3440 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.0, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 23.0, "completions/mean_terminated_length": 23.0, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 63.72222222222222, "frac_reward_zero_std": 1.0, "grad_norm": 0.03177766129374504, "kl": 0.00907775666564703, "learning_rate": 3.644444444444445e-06, "loss": 0.0004, "num_tokens": 1033868.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3441 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.5, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 10.5, "completions/mean_terminated_length": 10.5, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 63.74074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.10806180536746979, "kl": 0.016441165935248137, "learning_rate": 3.643888888888889e-06, "loss": 0.0008, "num_tokens": 1034134.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3442 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.75, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 13.75, "completions/mean_terminated_length": 13.75, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 63.75925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.24464640021324158, "kl": 0.05451287887990475, "learning_rate": 3.6433333333333337e-06, "loss": 0.0029, "num_tokens": 1034421.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3443 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 35.25, "completions/clipped_ratio": 0.0, "completions/max_length": 36.0, "completions/max_terminated_length": 36.0, "completions/mean_length": 35.25, "completions/mean_terminated_length": 35.25, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 63.77777777777778, "frac_reward_zero_std": 0.0, "grad_norm": 2.071403741836548, "kl": 0.4626026041805744, "learning_rate": 3.6427777777777777e-06, "loss": 0.0218, "num_tokens": 1034790.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 3444 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 63.7962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.1711988002061844, "kl": 0.028164531104266644, "learning_rate": 3.6422222222222225e-06, "loss": 0.0014, "num_tokens": 1035074.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3445 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 63.81481481481482, "frac_reward_zero_std": 1.0, "grad_norm": 0.03314780071377754, "kl": 0.003450791002251208, "learning_rate": 3.6416666666666672e-06, "loss": 0.0002, "num_tokens": 1035378.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3446 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.25, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 17.25, "completions/mean_terminated_length": 17.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 63.833333333333336, "frac_reward_zero_std": 1.0, "grad_norm": 0.015785573050379753, "kl": 0.1699732318520546, "learning_rate": 3.641111111111111e-06, "loss": 0.0085, "num_tokens": 1035687.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3447 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 63.851851851851855, "frac_reward_zero_std": 1.0, "grad_norm": 0.04916204884648323, "kl": 0.011550935916602612, "learning_rate": 3.640555555555556e-06, "loss": 0.0006, "num_tokens": 1035959.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3448 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.25, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 20.25, "completions/mean_terminated_length": 20.25, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 63.870370370370374, "frac_reward_zero_std": 1.0, "grad_norm": 0.13697466254234314, "kl": 0.026882126927375793, "learning_rate": 3.6400000000000003e-06, "loss": 0.0014, "num_tokens": 1036260.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3449 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 34.5, "completions/clipped_ratio": 0.0, "completions/max_length": 40.0, "completions/max_terminated_length": 40.0, "completions/mean_length": 34.5, "completions/mean_terminated_length": 34.5, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 63.888888888888886, "frac_reward_zero_std": 0.0, "grad_norm": 2.849111318588257, "kl": 0.06913641840219498, "learning_rate": 3.639444444444445e-06, "loss": 0.04, "num_tokens": 1036622.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 3450 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 63.907407407407405, "frac_reward_zero_std": 1.0, "grad_norm": 0.046458806842565536, "kl": 0.014837665483355522, "learning_rate": 3.638888888888889e-06, "loss": 0.0007, "num_tokens": 1036902.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3451 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.25, "completions/clipped_ratio": 0.0, "completions/max_length": 36.0, "completions/max_terminated_length": 36.0, "completions/mean_length": 24.25, "completions/mean_terminated_length": 24.25, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 63.925925925925924, "frac_reward_zero_std": 0.0, "grad_norm": 2.909067153930664, "kl": 0.05603984370827675, "learning_rate": 3.638333333333334e-06, "loss": -0.1154, "num_tokens": 1037223.0, "reward": 7.625, "reward_std": 0.25, "rewards/reward_combined/mean": 7.625, "rewards/reward_combined/std": 0.25, "step": 3452 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 63.94444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.015269999392330647, "kl": 0.028650594875216484, "learning_rate": 3.6377777777777777e-06, "loss": 0.0014, "num_tokens": 1037491.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3453 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.25, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.25, "completions/mean_terminated_length": 14.25, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 63.96296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.11426541954278946, "kl": 0.011232744669541717, "learning_rate": 3.6372222222222225e-06, "loss": 0.0006, "num_tokens": 1037760.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3454 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.25, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 10.25, "completions/mean_terminated_length": 10.25, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 63.98148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.040992386639118195, "kl": 0.0032591187555226497, "learning_rate": 3.6366666666666673e-06, "loss": 0.0002, "num_tokens": 1038017.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 3455 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 10.5, "completions/mean_terminated_length": 10.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 64.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.058001477271318436, "kl": 0.007848855573683977, "learning_rate": 3.6361111111111112e-06, "loss": 0.0004, "num_tokens": 1038279.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3456 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.0, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 24.0, "completions/mean_terminated_length": 24.0, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 64.01851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.001774320495314896, "kl": 0.0903722420334816, "learning_rate": 3.635555555555556e-06, "loss": 0.0045, "num_tokens": 1038643.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 3457 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 64.03703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.011684424243867397, "kl": 0.2142346203327179, "learning_rate": 3.6350000000000003e-06, "loss": 0.0107, "num_tokens": 1038947.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3458 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 64.05555555555556, "frac_reward_zero_std": 0.0, "grad_norm": 4.075360298156738, "kl": 0.010756616480648518, "learning_rate": 3.6344444444444447e-06, "loss": 0.0341, "num_tokens": 1039225.0, "reward": 7.875, "reward_std": 0.25, "rewards/reward_combined/mean": 7.875, "rewards/reward_combined/std": 0.25, "step": 3459 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 64.07407407407408, "frac_reward_zero_std": 0.0, "grad_norm": 4.484482765197754, "kl": 0.059267494129016995, "learning_rate": 3.633888888888889e-06, "loss": 0.2015, "num_tokens": 1039499.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 3460 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 64.0925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.0023198802955448627, "kl": 0.0002818733482854441, "learning_rate": 3.633333333333334e-06, "loss": 0.0, "num_tokens": 1039755.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 3461 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 64.11111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.011581243947148323, "kl": 0.014544166624546051, "learning_rate": 3.6327777777777778e-06, "loss": 0.0007, "num_tokens": 1040067.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 3462 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 64.12962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.004091101232916117, "kl": 0.0004126429557800293, "learning_rate": 3.6322222222222226e-06, "loss": 0.0, "num_tokens": 1040279.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 3463 }, { "clip_ratio/high_max": 0.005494505632668734, "clip_ratio/high_mean": 0.005494505632668734, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005494505632668734, "completion_length": 29.25, "completions/clipped_ratio": 0.0, "completions/max_length": 47.0, "completions/max_terminated_length": 47.0, "completions/mean_length": 29.25, "completions/mean_terminated_length": 29.25, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 64.14814814814815, "frac_reward_zero_std": 0.0, "grad_norm": 2.4266912937164307, "kl": 0.19108977913856506, "learning_rate": 3.6316666666666673e-06, "loss": -0.0083, "num_tokens": 1040616.0, "reward": 5.875, "reward_std": 3.25, "rewards/reward_combined/mean": 5.875, "rewards/reward_combined/std": 3.25, "step": 3464 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.0, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 21.0, "completions/mean_terminated_length": 21.0, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 64.16666666666667, "frac_reward_zero_std": 1.0, "grad_norm": 0.025369931012392044, "kl": 0.008330741431564093, "learning_rate": 3.6311111111111113e-06, "loss": 0.0004, "num_tokens": 1040944.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3465 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.75, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 6.75, "completions/mean_terminated_length": 6.75, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 64.18518518518519, "frac_reward_zero_std": 0.0, "grad_norm": 15.923730850219727, "kl": 0.02966093533905223, "learning_rate": 3.630555555555556e-06, "loss": 0.2223, "num_tokens": 1041171.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 3466 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.25, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 17.25, "completions/mean_terminated_length": 17.25, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 64.20370370370371, "frac_reward_zero_std": 0.0, "grad_norm": 4.548930644989014, "kl": 0.48011159896850586, "learning_rate": 3.6300000000000004e-06, "loss": -0.0825, "num_tokens": 1041468.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 3467 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.5, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.5, "completions/mean_terminated_length": 6.5, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 64.22222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.013245752081274986, "kl": 0.0008652522519696504, "learning_rate": 3.6294444444444448e-06, "loss": 0.0, "num_tokens": 1041702.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 3468 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.0, "completions/clipped_ratio": 0.0, "completions/max_length": 27.0, "completions/max_terminated_length": 27.0, "completions/mean_length": 22.0, "completions/mean_terminated_length": 22.0, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 64.24074074074075, "frac_reward_zero_std": 0.0, "grad_norm": 2.856318235397339, "kl": 0.08136125281453133, "learning_rate": 3.628888888888889e-06, "loss": 0.0171, "num_tokens": 1042038.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 3469 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 64.25925925925925, "frac_reward_zero_std": 1.0, "grad_norm": 0.16955411434173584, "kl": 0.024819863960146904, "learning_rate": 3.628333333333334e-06, "loss": 0.0012, "num_tokens": 1042306.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3470 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 64.27777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.014280728064477444, "kl": 0.015118499752134085, "learning_rate": 3.627777777777778e-06, "loss": 0.0008, "num_tokens": 1042566.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3471 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.0, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 20.0, "completions/mean_terminated_length": 20.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 64.29629629629629, "frac_reward_zero_std": 1.0, "grad_norm": 0.08264398574829102, "kl": 0.02987201325595379, "learning_rate": 3.6272222222222226e-06, "loss": 0.0015, "num_tokens": 1042894.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3472 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 30.75, "completions/clipped_ratio": 0.0, "completions/max_length": 46.0, "completions/max_terminated_length": 46.0, "completions/mean_length": 30.75, "completions/mean_terminated_length": 30.75, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 64.31481481481481, "frac_reward_zero_std": 0.0, "grad_norm": 3.140119791030884, "kl": 0.38865772634744644, "learning_rate": 3.6266666666666674e-06, "loss": 0.0212, "num_tokens": 1043233.0, "reward": 4.0, "reward_std": 4.690415859222412, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 4.690415859222412, "step": 3473 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.0, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 17.0, "completions/mean_terminated_length": 17.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 64.33333333333333, "frac_reward_zero_std": 0.0, "grad_norm": 7.73644495010376, "kl": 0.29065240919589996, "learning_rate": 3.6261111111111113e-06, "loss": 0.0852, "num_tokens": 1043521.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 3474 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 64.35185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.05028565973043442, "kl": 0.004957215976901352, "learning_rate": 3.625555555555556e-06, "loss": 0.0003, "num_tokens": 1043803.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3475 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.75, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 19.75, "completions/mean_terminated_length": 19.75, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 64.37037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.10976999253034592, "kl": 0.014098451007157564, "learning_rate": 3.625e-06, "loss": 0.0007, "num_tokens": 1044102.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3476 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 30.5, "completions/clipped_ratio": 0.0, "completions/max_length": 39.0, "completions/max_terminated_length": 39.0, "completions/mean_length": 30.5, "completions/mean_terminated_length": 30.5, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 64.38888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.06392194330692291, "kl": 0.045382535085082054, "learning_rate": 3.624444444444445e-06, "loss": 0.0024, "num_tokens": 1044448.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3477 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 41.75, "completions/clipped_ratio": 0.0, "completions/max_length": 84.0, "completions/max_terminated_length": 84.0, "completions/mean_length": 41.75, "completions/mean_terminated_length": 41.75, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 64.4074074074074, "frac_reward_zero_std": 0.0, "grad_norm": 2.5782063007354736, "kl": 0.05625065974891186, "learning_rate": 3.623888888888889e-06, "loss": 0.2302, "num_tokens": 1044839.0, "reward": 6.0, "reward_std": 3.674234628677368, "rewards/reward_combined/mean": 6.0, "rewards/reward_combined/std": 3.674234628677368, "step": 3478 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.25, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 17.25, "completions/mean_terminated_length": 17.25, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 64.42592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.009211399592459202, "kl": 0.00154626410221681, "learning_rate": 3.6233333333333335e-06, "loss": 0.0001, "num_tokens": 1045148.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3479 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.25, "completions/clipped_ratio": 0.0, "completions/max_length": 28.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 21.25, "completions/mean_terminated_length": 21.25, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 64.44444444444444, "frac_reward_zero_std": 0.0, "grad_norm": 3.1371853351593018, "kl": 0.13426256366074085, "learning_rate": 3.622777777777778e-06, "loss": 0.1064, "num_tokens": 1045485.0, "reward": 5.875, "reward_std": 3.25, "rewards/reward_combined/mean": 5.875, "rewards/reward_combined/std": 3.25, "step": 3480 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 27.0, "completions/clipped_ratio": 0.0, "completions/max_length": 32.0, "completions/max_terminated_length": 32.0, "completions/mean_length": 27.0, "completions/mean_terminated_length": 27.0, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 64.46296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 1.7415982484817505, "kl": 0.3433784395456314, "learning_rate": 3.6222222222222226e-06, "loss": 0.0177, "num_tokens": 1045829.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3481 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.25, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 11.25, "completions/mean_terminated_length": 11.25, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 64.48148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.16299718618392944, "kl": 0.03442233055830002, "learning_rate": 3.6216666666666674e-06, "loss": 0.0018, "num_tokens": 1046094.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3482 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.75, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 4.75, "completions/mean_terminated_length": 4.75, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 64.5, "frac_reward_zero_std": 1.0, "grad_norm": 0.2888355851173401, "kl": 0.046954572200775146, "learning_rate": 3.6211111111111114e-06, "loss": 0.0026, "num_tokens": 1046313.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 3483 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 58.75, "completions/clipped_ratio": 0.0, "completions/max_length": 189.0, "completions/max_terminated_length": 189.0, "completions/mean_length": 58.75, "completions/mean_terminated_length": 58.75, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 64.51851851851852, "frac_reward_zero_std": 0.0, "grad_norm": 1.894339919090271, "kl": 0.015368820168077946, "learning_rate": 3.620555555555556e-06, "loss": 0.4272, "num_tokens": 1046780.0, "reward": 5.300000190734863, "reward_std": 5.400000095367432, "rewards/reward_combined/mean": 5.300000190734863, "rewards/reward_combined/std": 5.40000057220459, "step": 3484 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 64.53703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.03386625647544861, "kl": 0.0030463114380836487, "learning_rate": 3.62e-06, "loss": 0.0002, "num_tokens": 1047040.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 3485 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.5, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 12.5, "completions/mean_terminated_length": 12.5, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 64.55555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.1871435046195984, "kl": 0.037888760678470135, "learning_rate": 3.619444444444445e-06, "loss": 0.0019, "num_tokens": 1047306.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3486 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 64.57407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.014099165797233582, "kl": 0.0014913061168044806, "learning_rate": 3.618888888888889e-06, "loss": 0.0001, "num_tokens": 1047572.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3487 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.25, "completions/clipped_ratio": 0.0, "completions/max_length": 30.0, "completions/max_terminated_length": 30.0, "completions/mean_length": 25.25, "completions/mean_terminated_length": 25.25, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 64.5925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.12556293606758118, "kl": 0.055631248280406, "learning_rate": 3.6183333333333336e-06, "loss": 0.0028, "num_tokens": 1047901.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3488 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 64.61111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.0503348745405674, "kl": 0.02362844254821539, "learning_rate": 3.617777777777778e-06, "loss": 0.0012, "num_tokens": 1048199.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3489 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.5, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 19.5, "completions/mean_terminated_length": 19.5, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 64.62962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.06877920776605606, "kl": 0.03963305614888668, "learning_rate": 3.6172222222222227e-06, "loss": 0.0019, "num_tokens": 1048529.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3490 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 64.64814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.012511859647929668, "kl": 0.004562696267385036, "learning_rate": 3.616666666666667e-06, "loss": 0.0002, "num_tokens": 1048817.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3491 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 64.66666666666667, "frac_reward_zero_std": 1.0, "grad_norm": 2.848395347595215, "kl": 0.7961670458316803, "learning_rate": 3.6161111111111114e-06, "loss": 0.0398, "num_tokens": 1049025.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 3492 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.25, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 24.25, "completions/mean_terminated_length": 24.25, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 64.68518518518519, "frac_reward_zero_std": 0.0, "grad_norm": 3.135899066925049, "kl": 0.016112995333969593, "learning_rate": 3.615555555555556e-06, "loss": -0.0078, "num_tokens": 1049350.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 3493 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 64.70370370370371, "frac_reward_zero_std": 1.0, "grad_norm": 0.014952090568840504, "kl": 0.0012537389993667603, "learning_rate": 3.615e-06, "loss": 0.0001, "num_tokens": 1049562.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 3494 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 64.72222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.014673092402517796, "kl": 0.028803083114326, "learning_rate": 3.614444444444445e-06, "loss": 0.0014, "num_tokens": 1049830.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3495 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 64.74074074074075, "frac_reward_zero_std": 1.0, "grad_norm": 0.01848515309393406, "kl": 0.0021760740783065557, "learning_rate": 3.613888888888889e-06, "loss": 0.0001, "num_tokens": 1050108.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3496 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 64.75925925925925, "frac_reward_zero_std": 1.0, "grad_norm": 0.057626575231552124, "kl": 0.01137112407013774, "learning_rate": 3.6133333333333336e-06, "loss": 0.0006, "num_tokens": 1050410.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3497 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 43.0, "completions/clipped_ratio": 0.0, "completions/max_length": 44.0, "completions/max_terminated_length": 44.0, "completions/mean_length": 43.0, "completions/mean_terminated_length": 43.0, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "epoch": 64.77777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.04084748029708862, "kl": 0.04562484472990036, "learning_rate": 3.612777777777778e-06, "loss": 0.0023, "num_tokens": 1050862.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 3498 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 64.79629629629629, "frac_reward_zero_std": 1.0, "grad_norm": 0.00014328921679407358, "kl": 4.343688488006592e-05, "learning_rate": 3.6122222222222223e-06, "loss": 0.0, "num_tokens": 1051082.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 3499 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 64.81481481481481, "frac_reward_zero_std": 1.0, "grad_norm": 0.08401252329349518, "kl": 0.022543327882885933, "learning_rate": 3.611666666666667e-06, "loss": 0.0012, "num_tokens": 1051368.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3500 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.5, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 17.5, "completions/mean_terminated_length": 17.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 64.83333333333333, "frac_reward_zero_std": 1.0, "grad_norm": 0.023313576355576515, "kl": 0.16719020158052444, "learning_rate": 3.6111111111111115e-06, "loss": 0.0084, "num_tokens": 1051678.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3501 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 64.85185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.009468886069953442, "kl": 0.0030659688636660576, "learning_rate": 3.6105555555555562e-06, "loss": 0.0002, "num_tokens": 1051962.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3502 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 64.87037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.002282497240230441, "kl": 0.010842852294445038, "learning_rate": 3.61e-06, "loss": 0.0005, "num_tokens": 1052198.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 3503 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 28.0, "completions/clipped_ratio": 0.0, "completions/max_length": 48.0, "completions/max_terminated_length": 48.0, "completions/mean_length": 28.0, "completions/mean_terminated_length": 28.0, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 64.88888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.07316410541534424, "kl": 0.02648860216140747, "learning_rate": 3.609444444444445e-06, "loss": 0.0012, "num_tokens": 1052530.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3504 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 30.5, "completions/clipped_ratio": 0.0, "completions/max_length": 35.0, "completions/max_terminated_length": 35.0, "completions/mean_length": 30.5, "completions/mean_terminated_length": 30.5, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 64.9074074074074, "frac_reward_zero_std": 0.0, "grad_norm": 2.5859811305999756, "kl": 0.2175716906785965, "learning_rate": 3.608888888888889e-06, "loss": -0.0164, "num_tokens": 1052880.0, "reward": 5.625, "reward_std": 2.462214469909668, "rewards/reward_combined/mean": 5.625, "rewards/reward_combined/std": 2.462214469909668, "step": 3505 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 64.92592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.06984540820121765, "kl": 0.016948397271335125, "learning_rate": 3.6083333333333337e-06, "loss": 0.0009, "num_tokens": 1053154.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3506 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 64.94444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.033716753125190735, "kl": 0.003001514822244644, "learning_rate": 3.607777777777778e-06, "loss": 0.0002, "num_tokens": 1053398.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 3507 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 64.96296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.03225850313901901, "kl": 0.007359512150287628, "learning_rate": 3.6072222222222224e-06, "loss": 0.0005, "num_tokens": 1053654.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 3508 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 64.98148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.022602150216698647, "kl": 0.0016214643546845764, "learning_rate": 3.606666666666667e-06, "loss": 0.0001, "num_tokens": 1053924.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3509 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.75, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.75, "completions/mean_terminated_length": 15.75, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 65.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.027418503537774086, "kl": 0.0019748391350731254, "learning_rate": 3.6061111111111115e-06, "loss": 0.0001, "num_tokens": 1054247.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3510 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 34.5, "completions/clipped_ratio": 0.0, "completions/max_length": 47.0, "completions/max_terminated_length": 47.0, "completions/mean_length": 34.5, "completions/mean_terminated_length": 34.5, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 65.01851851851852, "frac_reward_zero_std": 0.0, "grad_norm": 2.507833480834961, "kl": 0.09608860686421394, "learning_rate": 3.605555555555556e-06, "loss": 0.0627, "num_tokens": 1054609.0, "reward": 4.125, "reward_std": 4.308422088623047, "rewards/reward_combined/mean": 4.125, "rewards/reward_combined/std": 4.308422088623047, "step": 3511 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.25, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 17.25, "completions/mean_terminated_length": 17.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 65.03703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.07456950098276138, "kl": 0.013651876710355282, "learning_rate": 3.6050000000000002e-06, "loss": 0.0007, "num_tokens": 1054918.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3512 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.25, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 12.25, "completions/mean_terminated_length": 12.25, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 65.05555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.14582647383213043, "kl": 0.03057588543742895, "learning_rate": 3.604444444444445e-06, "loss": 0.0016, "num_tokens": 1055187.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3513 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 65.07407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.06746110320091248, "kl": 0.007857197895646095, "learning_rate": 3.603888888888889e-06, "loss": 0.0004, "num_tokens": 1055471.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3514 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 65.0925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.01995133049786091, "kl": 0.013683590572327375, "learning_rate": 3.6033333333333337e-06, "loss": 0.0007, "num_tokens": 1055731.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3515 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 65.11111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.02681112289428711, "kl": 0.004282777197659016, "learning_rate": 3.6027777777777776e-06, "loss": 0.0002, "num_tokens": 1056015.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3516 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.0, "completions/clipped_ratio": 0.0, "completions/max_length": 28.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 25.0, "completions/mean_terminated_length": 25.0, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 65.12962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.17410871386528015, "kl": 0.04974578693509102, "learning_rate": 3.6022222222222224e-06, "loss": 0.0026, "num_tokens": 1056367.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3517 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 65.14814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.016318438574671745, "kl": 0.02617364190518856, "learning_rate": 3.601666666666667e-06, "loss": 0.0013, "num_tokens": 1056635.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3518 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 30.75, "completions/clipped_ratio": 0.0, "completions/max_length": 36.0, "completions/max_terminated_length": 36.0, "completions/mean_length": 30.75, "completions/mean_terminated_length": 30.75, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 65.16666666666667, "frac_reward_zero_std": 0.0, "grad_norm": 3.5122478008270264, "kl": 0.14498820900917053, "learning_rate": 3.601111111111111e-06, "loss": 0.0887, "num_tokens": 1056986.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 3519 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 65.18518518518519, "frac_reward_zero_std": 1.0, "grad_norm": 0.011465754359960556, "kl": 0.01437196135520935, "learning_rate": 3.600555555555556e-06, "loss": 0.0007, "num_tokens": 1057298.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 3520 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 65.20370370370371, "frac_reward_zero_std": 1.0, "grad_norm": 0.009844961576163769, "kl": 0.0028815865516662598, "learning_rate": 3.6000000000000003e-06, "loss": 0.0001, "num_tokens": 1057542.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 3521 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 65.22222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.024480687454342842, "kl": 0.012664864349062555, "learning_rate": 3.599444444444445e-06, "loss": 0.0006, "num_tokens": 1057840.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3522 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.75, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.75, "completions/mean_terminated_length": 6.75, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 65.24074074074075, "frac_reward_zero_std": 1.0, "grad_norm": 0.009755557402968407, "kl": 0.0007286093168659136, "learning_rate": 3.598888888888889e-06, "loss": 0.0, "num_tokens": 1058075.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 3523 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.5, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 23.5, "completions/mean_terminated_length": 23.5, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 65.25925925925925, "frac_reward_zero_std": 0.0, "grad_norm": 2.464174747467041, "kl": 0.044224596582353115, "learning_rate": 3.5983333333333338e-06, "loss": 0.0202, "num_tokens": 1058397.0, "reward": 5.125, "reward_std": 3.4731109142303467, "rewards/reward_combined/mean": 5.125, "rewards/reward_combined/std": 3.4731109142303467, "step": 3524 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 65.27777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.14824271202087402, "kl": 0.02311795437708497, "learning_rate": 3.5977777777777777e-06, "loss": 0.0011, "num_tokens": 1058665.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3525 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 26.75, "completions/clipped_ratio": 0.0, "completions/max_length": 48.0, "completions/max_terminated_length": 48.0, "completions/mean_length": 26.75, "completions/mean_terminated_length": 26.75, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 65.29629629629629, "frac_reward_zero_std": 1.0, "grad_norm": 0.0352395623922348, "kl": 0.03973138611763716, "learning_rate": 3.5972222222222225e-06, "loss": 0.002, "num_tokens": 1058992.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3526 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 30.0, "completions/clipped_ratio": 0.0, "completions/max_length": 42.0, "completions/max_terminated_length": 42.0, "completions/mean_length": 30.0, "completions/mean_terminated_length": 30.0, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 65.31481481481481, "frac_reward_zero_std": 0.0, "grad_norm": 1.8284852504730225, "kl": 0.08240636810660362, "learning_rate": 3.5966666666666672e-06, "loss": 0.1416, "num_tokens": 1059348.0, "reward": 5.375, "reward_std": 2.462214469909668, "rewards/reward_combined/mean": 5.375, "rewards/reward_combined/std": 2.462214469909668, "step": 3527 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 65.33333333333333, "frac_reward_zero_std": 1.0, "grad_norm": 0.03540394827723503, "kl": 0.00720764696598053, "learning_rate": 3.596111111111111e-06, "loss": 0.0003, "num_tokens": 1059640.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3528 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 65.35185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.07928937673568726, "kl": 0.23161036521196365, "learning_rate": 3.595555555555556e-06, "loss": 0.0116, "num_tokens": 1059944.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3529 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 43.75, "completions/clipped_ratio": 0.0, "completions/max_length": 45.0, "completions/max_terminated_length": 45.0, "completions/mean_length": 43.75, "completions/mean_terminated_length": 43.75, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 65.37037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.023417619988322258, "kl": 0.039117004722356796, "learning_rate": 3.5950000000000003e-06, "loss": 0.002, "num_tokens": 1060399.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 3530 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 65.38888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.006830081343650818, "kl": 0.0007846057415008545, "learning_rate": 3.5944444444444447e-06, "loss": 0.0, "num_tokens": 1060611.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 3531 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.25, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 19.25, "completions/mean_terminated_length": 19.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 65.4074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.05053897202014923, "kl": 0.010882181115448475, "learning_rate": 3.593888888888889e-06, "loss": 0.0005, "num_tokens": 1060932.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3532 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.25, "completions/clipped_ratio": 0.0, "completions/max_length": 30.0, "completions/max_terminated_length": 30.0, "completions/mean_length": 22.25, "completions/mean_terminated_length": 22.25, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 65.42592592592592, "frac_reward_zero_std": 0.0, "grad_norm": 4.09092903137207, "kl": 0.035991473123431206, "learning_rate": 3.593333333333334e-06, "loss": 0.1117, "num_tokens": 1061241.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 3533 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 65.44444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.09388772398233414, "kl": 0.0029056668281555176, "learning_rate": 3.5927777777777777e-06, "loss": 0.0001, "num_tokens": 1061449.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 3534 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.25, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.25, "completions/mean_terminated_length": 13.25, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 65.46296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.05789894610643387, "kl": 0.004965706495568156, "learning_rate": 3.5922222222222225e-06, "loss": 0.0002, "num_tokens": 1061714.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3535 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.25, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 4.25, "completions/mean_terminated_length": 4.25, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 65.48148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.02826298587024212, "kl": 0.0005399539950303733, "learning_rate": 3.5916666666666673e-06, "loss": 0.0, "num_tokens": 1061927.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 3536 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 65.5, "frac_reward_zero_std": 1.0, "grad_norm": 0.0023315241560339928, "kl": 0.01082899421453476, "learning_rate": 3.5911111111111112e-06, "loss": 0.0005, "num_tokens": 1062163.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 3537 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 65.51851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.035684626549482346, "kl": 0.01076507568359375, "learning_rate": 3.590555555555556e-06, "loss": 0.0005, "num_tokens": 1062451.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3538 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 65.53703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.018490666523575783, "kl": 0.008390807546675205, "learning_rate": 3.5900000000000004e-06, "loss": 0.0004, "num_tokens": 1062739.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3539 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 65.55555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.013960250653326511, "kl": 0.009729968383908272, "learning_rate": 3.5894444444444447e-06, "loss": 0.0005, "num_tokens": 1063011.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3540 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 7.25, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 7.25, "completions/mean_terminated_length": 7.25, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 65.57407407407408, "frac_reward_zero_std": 0.0, "grad_norm": 7.766412258148193, "kl": 0.01864018029300496, "learning_rate": 3.588888888888889e-06, "loss": 0.248, "num_tokens": 1063240.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 3541 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 30.25, "completions/clipped_ratio": 0.0, "completions/max_length": 54.0, "completions/max_terminated_length": 54.0, "completions/mean_length": 30.25, "completions/mean_terminated_length": 30.25, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 65.5925925925926, "frac_reward_zero_std": 0.0, "grad_norm": 2.040574789047241, "kl": 0.1141183190047741, "learning_rate": 3.588333333333334e-06, "loss": -0.1783, "num_tokens": 1063581.0, "reward": 3.125, "reward_std": 1.4361406564712524, "rewards/reward_combined/mean": 3.125, "rewards/reward_combined/std": 1.4361406564712524, "step": 3542 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.5, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 19.5, "completions/mean_terminated_length": 19.5, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 65.61111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.02722400613129139, "kl": 0.0011793615412898362, "learning_rate": 3.5877777777777778e-06, "loss": 0.0001, "num_tokens": 1063899.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3543 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.75, "completions/clipped_ratio": 0.0, "completions/max_length": 32.0, "completions/max_terminated_length": 32.0, "completions/mean_length": 25.75, "completions/mean_terminated_length": 25.75, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 65.62962962962963, "frac_reward_zero_std": 0.0, "grad_norm": 3.5273611545562744, "kl": 0.16629290580749512, "learning_rate": 3.5872222222222226e-06, "loss": -0.0063, "num_tokens": 1064218.0, "reward": 4.375, "reward_std": 4.190763473510742, "rewards/reward_combined/mean": 4.375, "rewards/reward_combined/std": 4.190763473510742, "step": 3544 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.75, "completions/clipped_ratio": 0.0, "completions/max_length": 27.0, "completions/max_terminated_length": 27.0, "completions/mean_length": 9.75, "completions/mean_terminated_length": 9.75, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 65.64814814814815, "frac_reward_zero_std": 0.0, "grad_norm": 7.052092552185059, "kl": 0.17645391821861267, "learning_rate": 3.5866666666666673e-06, "loss": 0.389, "num_tokens": 1064457.0, "reward": 3.125, "reward_std": 1.75, "rewards/reward_combined/mean": 3.125, "rewards/reward_combined/std": 1.75, "step": 3545 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.25, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 19.25, "completions/mean_terminated_length": 19.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 65.66666666666667, "frac_reward_zero_std": 1.0, "grad_norm": 0.12307872623205185, "kl": 0.16924121230840683, "learning_rate": 3.5861111111111113e-06, "loss": 0.0085, "num_tokens": 1064774.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3546 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 18.0, "completions/mean_terminated_length": 18.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 65.68518518518519, "frac_reward_zero_std": 1.0, "grad_norm": 0.03669394180178642, "kl": 0.04728009179234505, "learning_rate": 3.585555555555556e-06, "loss": 0.0022, "num_tokens": 1065098.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3547 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 65.70370370370371, "frac_reward_zero_std": 1.0, "grad_norm": 0.00014778602053411305, "kl": 3.9346516132354736e-05, "learning_rate": 3.585e-06, "loss": 0.0, "num_tokens": 1065318.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 3548 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.75, "completions/clipped_ratio": 0.0, "completions/max_length": 25.0, "completions/max_terminated_length": 25.0, "completions/mean_length": 20.75, "completions/mean_terminated_length": 20.75, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 65.72222222222223, "frac_reward_zero_std": 0.0, "grad_norm": 3.072556734085083, "kl": 0.34964896738529205, "learning_rate": 3.5844444444444448e-06, "loss": -0.0826, "num_tokens": 1065629.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 3549 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.5, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 19.5, "completions/mean_terminated_length": 19.5, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 65.74074074074075, "frac_reward_zero_std": 1.0, "grad_norm": 0.06049913540482521, "kl": 0.022634904831647873, "learning_rate": 3.583888888888889e-06, "loss": 0.0011, "num_tokens": 1065955.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3550 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.75, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 20.75, "completions/mean_terminated_length": 20.75, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 65.75925925925925, "frac_reward_zero_std": 0.0, "grad_norm": 2.7587454319000244, "kl": 0.07705311477184296, "learning_rate": 3.5833333333333335e-06, "loss": -0.0087, "num_tokens": 1066286.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 3551 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 65.77777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.13468530774116516, "kl": 0.027485518716275692, "learning_rate": 3.582777777777778e-06, "loss": 0.0013, "num_tokens": 1066574.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3552 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 65.79629629629629, "frac_reward_zero_std": 1.0, "grad_norm": 0.056047458201646805, "kl": 0.004154827445745468, "learning_rate": 3.5822222222222226e-06, "loss": 0.0002, "num_tokens": 1066834.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 3553 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 65.81481481481481, "frac_reward_zero_std": 1.0, "grad_norm": 0.10515999048948288, "kl": 0.009520506020635366, "learning_rate": 3.5816666666666674e-06, "loss": 0.0005, "num_tokens": 1067106.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3554 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 65.83333333333333, "frac_reward_zero_std": 1.0, "grad_norm": 0.001105761039070785, "kl": 0.00013324618339538574, "learning_rate": 3.5811111111111113e-06, "loss": 0.0, "num_tokens": 1067362.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 3555 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 65.85185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.025802915915846825, "kl": 0.0018368143355473876, "learning_rate": 3.580555555555556e-06, "loss": 0.0001, "num_tokens": 1067682.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3556 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.25, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 11.25, "completions/mean_terminated_length": 11.25, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 65.87037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 1.445748209953308, "kl": 0.19610746204853058, "learning_rate": 3.58e-06, "loss": 0.0117, "num_tokens": 1067947.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3557 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.25, "completions/clipped_ratio": 0.0, "completions/max_length": 30.0, "completions/max_terminated_length": 30.0, "completions/mean_length": 20.25, "completions/mean_terminated_length": 20.25, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 65.88888888888889, "frac_reward_zero_std": 0.0, "grad_norm": 4.636611461639404, "kl": 0.24728038907051086, "learning_rate": 3.579444444444445e-06, "loss": -0.07, "num_tokens": 1068252.0, "reward": 5.875, "reward_std": 3.5910768508911133, "rewards/reward_combined/mean": 5.875, "rewards/reward_combined/std": 3.5910770893096924, "step": 3558 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 65.9074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.08418147265911102, "kl": 0.025534145534038544, "learning_rate": 3.578888888888889e-06, "loss": 0.0013, "num_tokens": 1068516.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3559 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 65.92592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.018245887011289597, "kl": 0.01107862126082182, "learning_rate": 3.5783333333333335e-06, "loss": 0.0006, "num_tokens": 1068788.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3560 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 65.94444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.0249947402626276, "kl": 0.0009785071015357971, "learning_rate": 3.577777777777778e-06, "loss": 0.0, "num_tokens": 1069060.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3561 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.25, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.25, "completions/mean_terminated_length": 13.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 65.96296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.005539674311876297, "kl": 0.0018142893677577376, "learning_rate": 3.5772222222222227e-06, "loss": 0.0001, "num_tokens": 1069337.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3562 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.0, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 24.0, "completions/mean_terminated_length": 24.0, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 65.98148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.0016040202463045716, "kl": 0.09042217582464218, "learning_rate": 3.576666666666667e-06, "loss": 0.0045, "num_tokens": 1069701.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 3563 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 66.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.33412447571754456, "kl": 0.050757985562086105, "learning_rate": 3.5761111111111114e-06, "loss": 0.0025, "num_tokens": 1069961.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 3564 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.5, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 20.5, "completions/mean_terminated_length": 20.5, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 66.01851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.14492028951644897, "kl": 0.03262025024741888, "learning_rate": 3.575555555555556e-06, "loss": 0.0016, "num_tokens": 1070295.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3565 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.75, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 16.75, "completions/mean_terminated_length": 16.75, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 66.03703703703704, "frac_reward_zero_std": 0.0, "grad_norm": 2.8714897632598877, "kl": 0.008798012044280767, "learning_rate": 3.575e-06, "loss": 0.1668, "num_tokens": 1070582.0, "reward": 5.875, "reward_std": 3.25, "rewards/reward_combined/mean": 5.875, "rewards/reward_combined/std": 3.25, "step": 3566 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.25, "completions/clipped_ratio": 0.0, "completions/max_length": 30.0, "completions/max_terminated_length": 30.0, "completions/mean_length": 24.25, "completions/mean_terminated_length": 24.25, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 66.05555555555556, "frac_reward_zero_std": 0.0, "grad_norm": 4.991020202636719, "kl": 0.2446276992559433, "learning_rate": 3.574444444444445e-06, "loss": -0.0094, "num_tokens": 1070895.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 3567 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 66.07407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.017315497621893883, "kl": 0.014369001146405935, "learning_rate": 3.5738888888888888e-06, "loss": 0.0007, "num_tokens": 1071155.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3568 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.25, "completions/clipped_ratio": 0.0, "completions/max_length": 31.0, "completions/max_terminated_length": 31.0, "completions/mean_length": 25.25, "completions/mean_terminated_length": 25.25, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 66.0925925925926, "frac_reward_zero_std": 0.0, "grad_norm": 3.198134660720825, "kl": 0.16601085662841797, "learning_rate": 3.5733333333333336e-06, "loss": -0.0554, "num_tokens": 1071476.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 3569 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.5, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 8.5, "completions/mean_terminated_length": 8.5, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 66.11111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.016439255326986313, "kl": 0.0012131542025599629, "learning_rate": 3.5727777777777783e-06, "loss": 0.0001, "num_tokens": 1071730.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 3570 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.75, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 17.75, "completions/mean_terminated_length": 17.75, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 66.12962962962963, "frac_reward_zero_std": 0.0, "grad_norm": 4.169734477996826, "kl": 0.188784159719944, "learning_rate": 3.5722222222222223e-06, "loss": 0.0498, "num_tokens": 1072041.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 3571 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.5, "completions/clipped_ratio": 0.0, "completions/max_length": 44.0, "completions/max_terminated_length": 44.0, "completions/mean_length": 32.5, "completions/mean_terminated_length": 32.5, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 66.14814814814815, "frac_reward_zero_std": 0.0, "grad_norm": 2.448599100112915, "kl": 0.15254057943820953, "learning_rate": 3.571666666666667e-06, "loss": -0.0692, "num_tokens": 1072399.0, "reward": 4.5, "reward_std": 2.345207929611206, "rewards/reward_combined/mean": 4.5, "rewards/reward_combined/std": 2.345207929611206, "step": 3572 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 40.75, "completions/clipped_ratio": 0.0, "completions/max_length": 48.0, "completions/max_terminated_length": 48.0, "completions/mean_length": 40.75, "completions/mean_terminated_length": 40.75, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 66.16666666666667, "frac_reward_zero_std": 1.0, "grad_norm": 0.047783441841602325, "kl": 0.019293994642794132, "learning_rate": 3.5711111111111114e-06, "loss": 0.0009, "num_tokens": 1072782.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3573 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 66.18518518518519, "frac_reward_zero_std": 1.0, "grad_norm": 0.09952576458454132, "kl": 0.05507629411295056, "learning_rate": 3.570555555555556e-06, "loss": 0.0028, "num_tokens": 1073054.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3574 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 66.20370370370371, "frac_reward_zero_std": 1.0, "grad_norm": 0.010869992896914482, "kl": 0.0146358422935009, "learning_rate": 3.57e-06, "loss": 0.0007, "num_tokens": 1073366.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 3575 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.0, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 24.0, "completions/mean_terminated_length": 24.0, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 66.22222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.0016165610868483782, "kl": 0.09044806659221649, "learning_rate": 3.569444444444445e-06, "loss": 0.0045, "num_tokens": 1073730.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 3576 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.75, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.75, "completions/mean_terminated_length": 14.75, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 66.24074074074075, "frac_reward_zero_std": 1.0, "grad_norm": 0.09354139864444733, "kl": 0.022881975397467613, "learning_rate": 3.568888888888889e-06, "loss": 0.0011, "num_tokens": 1074021.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3577 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 66.25925925925925, "frac_reward_zero_std": 0.0, "grad_norm": 8.371512413024902, "kl": 0.07995305955410004, "learning_rate": 3.5683333333333336e-06, "loss": 0.1985, "num_tokens": 1074301.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 3578 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 5.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 5.0, "completions/mean_terminated_length": 5.0, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 66.27777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.004898715298622847, "kl": 0.0009453415987081826, "learning_rate": 3.5677777777777784e-06, "loss": 0.0, "num_tokens": 1074521.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 3579 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.5, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 16.5, "completions/mean_terminated_length": 16.5, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 66.29629629629629, "frac_reward_zero_std": 1.0, "grad_norm": 0.023645708337426186, "kl": 0.21283665299415588, "learning_rate": 3.5672222222222223e-06, "loss": 0.0106, "num_tokens": 1074827.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3580 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 43.25, "completions/clipped_ratio": 0.0, "completions/max_length": 44.0, "completions/max_terminated_length": 44.0, "completions/mean_length": 43.25, "completions/mean_terminated_length": 43.25, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 66.31481481481481, "frac_reward_zero_std": 1.0, "grad_norm": 0.036213845014572144, "kl": 0.04374518617987633, "learning_rate": 3.566666666666667e-06, "loss": 0.0022, "num_tokens": 1075280.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 3581 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 66.33333333333333, "frac_reward_zero_std": 1.0, "grad_norm": 0.04613799601793289, "kl": 0.02412213757634163, "learning_rate": 3.5661111111111115e-06, "loss": 0.0012, "num_tokens": 1075548.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3582 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 66.35185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.01902182400226593, "kl": 0.008124420419335365, "learning_rate": 3.565555555555556e-06, "loss": 0.0004, "num_tokens": 1075836.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3583 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.0, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 19.0, "completions/mean_terminated_length": 19.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 66.37037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.0010801077587530017, "kl": 0.0004565551789710298, "learning_rate": 3.565e-06, "loss": 0.0, "num_tokens": 1076152.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3584 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 66.38888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.02208133600652218, "kl": 0.002435125410556793, "learning_rate": 3.564444444444445e-06, "loss": 0.0001, "num_tokens": 1076412.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 3585 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 66.4074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.00016130688891280442, "kl": 2.559274435043335e-05, "learning_rate": 3.563888888888889e-06, "loss": 0.0, "num_tokens": 1076632.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 3586 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.5, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 21.5, "completions/mean_terminated_length": 21.5, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 66.42592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.19571705162525177, "kl": 0.05572357773780823, "learning_rate": 3.5633333333333337e-06, "loss": 0.0028, "num_tokens": 1076942.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3587 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 66.44444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.005511694587767124, "kl": 0.0003425776958465576, "learning_rate": 3.5627777777777784e-06, "loss": 0.0, "num_tokens": 1077154.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 3588 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 92.5, "completions/clipped_ratio": 0.25, "completions/max_length": 256.0, "completions/max_terminated_length": 38.0, "completions/mean_length": 92.5, "completions/mean_terminated_length": 38.0, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 66.46296296296296, "frac_reward_zero_std": 0.0, "grad_norm": 1.6940491199493408, "kl": 0.03946111723780632, "learning_rate": 3.5622222222222224e-06, "loss": 0.3721, "num_tokens": 1077748.0, "reward": 5.25, "reward_std": 4.5, "rewards/reward_combined/mean": 5.25, "rewards/reward_combined/std": 4.5, "step": 3589 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 66.48148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.09670713543891907, "kl": 0.012829385697841644, "learning_rate": 3.561666666666667e-06, "loss": 0.0006, "num_tokens": 1078016.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3590 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 38.25, "completions/clipped_ratio": 0.0, "completions/max_length": 63.0, "completions/max_terminated_length": 63.0, "completions/mean_length": 38.25, "completions/mean_terminated_length": 38.25, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 66.5, "frac_reward_zero_std": 0.0, "grad_norm": 2.5154953002929688, "kl": 0.11109286546707153, "learning_rate": 3.561111111111111e-06, "loss": 0.2347, "num_tokens": 1078405.0, "reward": 3.674999952316284, "reward_std": 2.7548441886901855, "rewards/reward_combined/mean": 3.674999952316284, "rewards/reward_combined/std": 2.7548444271087646, "step": 3591 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 66.51851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.011575859971344471, "kl": 0.0003280073433415964, "learning_rate": 3.560555555555556e-06, "loss": 0.0, "num_tokens": 1078661.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 3592 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.5, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 12.5, "completions/mean_terminated_length": 12.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 66.53703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.09494850784540176, "kl": 0.00590978586114943, "learning_rate": 3.5600000000000002e-06, "loss": 0.0003, "num_tokens": 1078923.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3593 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.5, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 3.5, "completions/mean_terminated_length": 3.5, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 66.55555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.08889426290988922, "kl": 0.0034754425287246704, "learning_rate": 3.559444444444445e-06, "loss": 0.0002, "num_tokens": 1079133.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 3594 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 66.57407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.0110711008310318, "kl": 0.0024892576038837433, "learning_rate": 3.558888888888889e-06, "loss": 0.0001, "num_tokens": 1079377.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 3595 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.0, "completions/clipped_ratio": 0.0, "completions/max_length": 27.0, "completions/max_terminated_length": 27.0, "completions/mean_length": 24.0, "completions/mean_terminated_length": 24.0, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 66.5925925925926, "frac_reward_zero_std": 0.0, "grad_norm": 3.781071186065674, "kl": 0.026522185187786818, "learning_rate": 3.5583333333333337e-06, "loss": 0.0263, "num_tokens": 1079721.0, "reward": 5.550000190734863, "reward_std": 3.9000003337860107, "rewards/reward_combined/mean": 5.550000190734863, "rewards/reward_combined/std": 3.9000000953674316, "step": 3596 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 66.61111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.019578786566853523, "kl": 0.0018239851342514157, "learning_rate": 3.5577777777777785e-06, "loss": 0.0001, "num_tokens": 1079997.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3597 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 66.62962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.019043652340769768, "kl": 0.004002270521596074, "learning_rate": 3.5572222222222224e-06, "loss": 0.0002, "num_tokens": 1080297.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3598 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.5, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 9.5, "completions/mean_terminated_length": 9.5, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 66.64814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.1594618707895279, "kl": 0.015581985004246235, "learning_rate": 3.556666666666667e-06, "loss": 0.0007, "num_tokens": 1080555.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3599 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 66.66666666666667, "frac_reward_zero_std": 1.0, "grad_norm": 0.11619346588850021, "kl": 0.03712746128439903, "learning_rate": 3.556111111111111e-06, "loss": 0.0019, "num_tokens": 1080843.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3600 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 66.68518518518519, "frac_reward_zero_std": 1.0, "grad_norm": 0.004567440133541822, "kl": 0.0009274303738493472, "learning_rate": 3.555555555555556e-06, "loss": 0.0, "num_tokens": 1081163.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3601 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.0, "completions/clipped_ratio": 0.0, "completions/max_length": 30.0, "completions/max_terminated_length": 30.0, "completions/mean_length": 23.0, "completions/mean_terminated_length": 23.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 66.70370370370371, "frac_reward_zero_std": 1.0, "grad_norm": 1.5665254592895508, "kl": 0.2911417931318283, "learning_rate": 3.5550000000000003e-06, "loss": 0.0146, "num_tokens": 1081483.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3602 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 66.72222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.0014047932345420122, "kl": 0.0017386930994689465, "learning_rate": 3.5544444444444446e-06, "loss": 0.0001, "num_tokens": 1081763.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3603 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 66.74074074074075, "frac_reward_zero_std": 1.0, "grad_norm": 0.0019543117377907038, "kl": 0.002766240038909018, "learning_rate": 3.553888888888889e-06, "loss": 0.0001, "num_tokens": 1082047.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3604 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.5, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 66.75925925925925, "frac_reward_zero_std": 1.0, "grad_norm": 0.12340641766786575, "kl": 0.040027398616075516, "learning_rate": 3.5533333333333338e-06, "loss": 0.0021, "num_tokens": 1082325.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3605 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 66.77777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.012724654749035835, "kl": 0.02693679928779602, "learning_rate": 3.552777777777778e-06, "loss": 0.0013, "num_tokens": 1082541.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 3606 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.0, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 19.0, "completions/mean_terminated_length": 19.0, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 66.79629629629629, "frac_reward_zero_std": 1.0, "grad_norm": 0.02928651124238968, "kl": 0.039788976311683655, "learning_rate": 3.5522222222222225e-06, "loss": 0.002, "num_tokens": 1082869.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3607 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 66.81481481481481, "frac_reward_zero_std": 1.0, "grad_norm": 0.01580074615776539, "kl": 0.0035497203934937716, "learning_rate": 3.5516666666666672e-06, "loss": 0.0002, "num_tokens": 1083161.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3608 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 66.83333333333333, "frac_reward_zero_std": 1.0, "grad_norm": 0.01369897648692131, "kl": 0.00013612210750579834, "learning_rate": 3.551111111111111e-06, "loss": 0.0, "num_tokens": 1083373.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 3609 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 66.85185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.029800571501255035, "kl": 0.0009474157996010035, "learning_rate": 3.550555555555556e-06, "loss": 0.0, "num_tokens": 1083641.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3610 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 66.87037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.003213412594050169, "kl": 0.01065998524427414, "learning_rate": 3.5500000000000003e-06, "loss": 0.0005, "num_tokens": 1083877.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 3611 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.0, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 23.0, "completions/mean_terminated_length": 23.0, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 66.88888888888889, "frac_reward_zero_std": 0.0, "grad_norm": 1.9334107637405396, "kl": 0.012754752766340971, "learning_rate": 3.5494444444444447e-06, "loss": 0.0264, "num_tokens": 1084197.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 3612 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.0, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 20.0, "completions/mean_terminated_length": 20.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 66.9074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.08959430456161499, "kl": 0.0253981314599514, "learning_rate": 3.548888888888889e-06, "loss": 0.0013, "num_tokens": 1084497.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3613 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.75, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.75, "completions/mean_terminated_length": 6.75, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 66.92592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.00575705012306571, "kl": 0.0009295408090110868, "learning_rate": 3.548333333333334e-06, "loss": 0.0, "num_tokens": 1084732.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 3614 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.0, "completions/clipped_ratio": 0.0, "completions/max_length": 28.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 22.0, "completions/mean_terminated_length": 22.0, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 66.94444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.4404389262199402, "kl": 0.0817864267155528, "learning_rate": 3.547777777777778e-06, "loss": 0.0037, "num_tokens": 1085068.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 3615 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 66.96296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.03779691830277443, "kl": 0.005656612222082913, "learning_rate": 3.5472222222222225e-06, "loss": 0.0003, "num_tokens": 1085366.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3616 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 66.98148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.08805135637521744, "kl": 0.0243197581730783, "learning_rate": 3.5466666666666673e-06, "loss": 0.0013, "num_tokens": 1085646.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3617 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.0, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 21.0, "completions/mean_terminated_length": 21.0, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 67.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.13541562855243683, "kl": 0.018401571549475193, "learning_rate": 3.5461111111111112e-06, "loss": 0.0009, "num_tokens": 1085974.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3618 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 67.01851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.003807264845818281, "kl": 0.010540425777435303, "learning_rate": 3.545555555555556e-06, "loss": 0.0005, "num_tokens": 1086210.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 3619 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 67.03703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.006257910281419754, "kl": 0.0005351901054382324, "learning_rate": 3.545e-06, "loss": 0.0, "num_tokens": 1086422.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 3620 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 10.5, "completions/mean_terminated_length": 10.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 67.05555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.04541511461138725, "kl": 0.0055177463218569756, "learning_rate": 3.5444444444444447e-06, "loss": 0.0003, "num_tokens": 1086684.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3621 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 33.75, "completions/clipped_ratio": 0.0, "completions/max_length": 39.0, "completions/max_terminated_length": 39.0, "completions/mean_length": 33.75, "completions/mean_terminated_length": 33.75, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 67.07407407407408, "frac_reward_zero_std": 0.0, "grad_norm": 1.8420366048812866, "kl": 0.2503596395254135, "learning_rate": 3.543888888888889e-06, "loss": -0.0781, "num_tokens": 1087047.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 3622 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 67.0925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.0953967422246933, "kl": 0.007997960899956524, "learning_rate": 3.5433333333333334e-06, "loss": 0.0004, "num_tokens": 1087311.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3623 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 27.0, "completions/clipped_ratio": 0.0, "completions/max_length": 30.0, "completions/max_terminated_length": 30.0, "completions/mean_length": 27.0, "completions/mean_terminated_length": 27.0, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 67.11111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.19024302065372467, "kl": 0.10660567134618759, "learning_rate": 3.542777777777778e-06, "loss": 0.0053, "num_tokens": 1087655.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3624 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.0, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 23.0, "completions/mean_terminated_length": 23.0, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 67.12962962962963, "frac_reward_zero_std": 0.0, "grad_norm": 4.033945560455322, "kl": 0.03562216181308031, "learning_rate": 3.5422222222222226e-06, "loss": 0.0438, "num_tokens": 1087975.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 3625 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.75, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 10.75, "completions/mean_terminated_length": 10.75, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 67.14814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.06592623889446259, "kl": 0.0072203699965029955, "learning_rate": 3.5416666666666673e-06, "loss": 0.0004, "num_tokens": 1088246.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 3626 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.0, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 19.0, "completions/mean_terminated_length": 19.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 67.16666666666667, "frac_reward_zero_std": 1.0, "grad_norm": 0.03302471339702606, "kl": 0.029373289085924625, "learning_rate": 3.5411111111111113e-06, "loss": 0.0014, "num_tokens": 1088574.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3627 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.5, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 20.5, "completions/mean_terminated_length": 20.5, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 67.18518518518519, "frac_reward_zero_std": 1.0, "grad_norm": 0.2083090990781784, "kl": 0.044301947578787804, "learning_rate": 3.540555555555556e-06, "loss": 0.0022, "num_tokens": 1088876.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3628 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 67.20370370370371, "frac_reward_zero_std": 1.0, "grad_norm": 0.03005102090537548, "kl": 0.0005638569709844887, "learning_rate": 3.54e-06, "loss": 0.0, "num_tokens": 1089132.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 3629 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.0, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 19.0, "completions/mean_terminated_length": 19.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 67.22222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.013592023402452469, "kl": 0.0009575795847922564, "learning_rate": 3.5394444444444448e-06, "loss": 0.0, "num_tokens": 1089448.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3630 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 34.0, "completions/max_terminated_length": 34.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 67.24074074074075, "frac_reward_zero_std": 0.0, "grad_norm": 8.135224342346191, "kl": 0.04004454426467419, "learning_rate": 3.538888888888889e-06, "loss": 0.3944, "num_tokens": 1089694.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 3631 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 67.25925925925925, "frac_reward_zero_std": 1.0, "grad_norm": 0.058179132640361786, "kl": 0.007659344235435128, "learning_rate": 3.5383333333333335e-06, "loss": 0.0004, "num_tokens": 1089962.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3632 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 67.27777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.2578822672367096, "kl": 0.023563940078020096, "learning_rate": 3.5377777777777783e-06, "loss": 0.0012, "num_tokens": 1090206.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 3633 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.25, "completions/clipped_ratio": 0.0, "completions/max_length": 31.0, "completions/max_terminated_length": 31.0, "completions/mean_length": 22.25, "completions/mean_terminated_length": 22.25, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 67.29629629629629, "frac_reward_zero_std": 1.0, "grad_norm": 0.07190565019845963, "kl": 0.025006237905472517, "learning_rate": 3.5372222222222226e-06, "loss": 0.0014, "num_tokens": 1090543.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 3634 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.0, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 24.0, "completions/mean_terminated_length": 24.0, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 67.31481481481481, "frac_reward_zero_std": 1.0, "grad_norm": 0.0016913486178964376, "kl": 0.09040756151080132, "learning_rate": 3.536666666666667e-06, "loss": 0.0045, "num_tokens": 1090907.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 3635 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 67.33333333333333, "frac_reward_zero_std": 1.0, "grad_norm": 0.011415771208703518, "kl": 0.015663007274270058, "learning_rate": 3.5361111111111113e-06, "loss": 0.0008, "num_tokens": 1091167.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3636 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 28.5, "completions/clipped_ratio": 0.0, "completions/max_length": 32.0, "completions/max_terminated_length": 32.0, "completions/mean_length": 28.5, "completions/mean_terminated_length": 28.5, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 67.35185185185185, "frac_reward_zero_std": 0.0, "grad_norm": 2.826112747192383, "kl": 0.06242197006940842, "learning_rate": 3.535555555555556e-06, "loss": -0.0593, "num_tokens": 1091505.0, "reward": 7.75, "reward_std": 0.28867512941360474, "rewards/reward_combined/mean": 7.75, "rewards/reward_combined/std": 0.28867512941360474, "step": 3637 }, { "clip_ratio/high_max": 0.01315789483487606, "clip_ratio/high_mean": 0.01315789483487606, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.01315789483487606, "completion_length": 28.5, "completions/clipped_ratio": 0.0, "completions/max_length": 50.0, "completions/max_terminated_length": 50.0, "completions/mean_length": 28.5, "completions/mean_terminated_length": 28.5, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 67.37037037037037, "frac_reward_zero_std": 0.0, "grad_norm": 1.8394206762313843, "kl": 0.0577030498534441, "learning_rate": 3.535e-06, "loss": -0.0258, "num_tokens": 1091839.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 3638 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.5, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 19.5, "completions/mean_terminated_length": 19.5, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 67.38888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.24175679683685303, "kl": 0.1890261396765709, "learning_rate": 3.534444444444445e-06, "loss": 0.0094, "num_tokens": 1092137.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 3639 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 67.4074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.004534916486591101, "kl": 0.0006311565812211484, "learning_rate": 3.5338888888888887e-06, "loss": 0.0, "num_tokens": 1092397.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 3640 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 36.0, "completions/clipped_ratio": 0.0, "completions/max_length": 40.0, "completions/max_terminated_length": 40.0, "completions/mean_length": 36.0, "completions/mean_terminated_length": 36.0, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 67.42592592592592, "frac_reward_zero_std": 0.0, "grad_norm": 0.8222032189369202, "kl": 0.07626905664801598, "learning_rate": 3.5333333333333335e-06, "loss": 0.0336, "num_tokens": 1092765.0, "reward": 4.625, "reward_std": 3.4731109142303467, "rewards/reward_combined/mean": 4.625, "rewards/reward_combined/std": 3.4731109142303467, "step": 3641 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.5, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 2.5, "completions/mean_terminated_length": 2.5, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 67.44444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.012324186973273754, "kl": 0.0015281662344932556, "learning_rate": 3.5327777777777783e-06, "loss": 0.0001, "num_tokens": 1092971.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 3642 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 67.46296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.00015323515981435776, "kl": 1.7642974853515625e-05, "learning_rate": 3.5322222222222222e-06, "loss": 0.0, "num_tokens": 1093191.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 3643 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 67.48148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.11311082541942596, "kl": 0.022042669006623328, "learning_rate": 3.531666666666667e-06, "loss": 0.0011, "num_tokens": 1093477.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3644 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.75, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 20.75, "completions/mean_terminated_length": 20.75, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 67.5, "frac_reward_zero_std": 1.0, "grad_norm": 0.020769167691469193, "kl": 0.01349325617775321, "learning_rate": 3.5311111111111114e-06, "loss": 0.0007, "num_tokens": 1093808.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3645 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.25, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 19.25, "completions/mean_terminated_length": 19.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 67.51851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.026232676580548286, "kl": 0.011209603631868958, "learning_rate": 3.530555555555556e-06, "loss": 0.0007, "num_tokens": 1094129.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3646 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 67.53703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.015626095235347748, "kl": 0.0008884288909030147, "learning_rate": 3.53e-06, "loss": 0.0, "num_tokens": 1094407.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3647 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 67.55555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.011719479225575924, "kl": 0.009289513807743788, "learning_rate": 3.529444444444445e-06, "loss": 0.0005, "num_tokens": 1094679.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3648 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.25, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 16.25, "completions/mean_terminated_length": 16.25, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 67.57407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.14997443556785583, "kl": 0.2559715360403061, "learning_rate": 3.528888888888889e-06, "loss": 0.0128, "num_tokens": 1094984.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3649 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 67.5925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.03686206787824631, "kl": 0.007735989522188902, "learning_rate": 3.5283333333333336e-06, "loss": 0.0004, "num_tokens": 1095276.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3650 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.0, "completions/max_length": 39.0, "completions/max_terminated_length": 39.0, "completions/mean_length": 32.0, "completions/mean_terminated_length": 32.0, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 67.61111111111111, "frac_reward_zero_std": 0.0, "grad_norm": 2.05293869972229, "kl": 0.21750066429376602, "learning_rate": 3.5277777777777784e-06, "loss": 0.1032, "num_tokens": 1095620.0, "reward": 4.5, "reward_std": 4.041451930999756, "rewards/reward_combined/mean": 4.5, "rewards/reward_combined/std": 4.041451930999756, "step": 3651 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.5, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.5, "completions/mean_terminated_length": 6.5, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 67.62962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.06944465637207031, "kl": 0.002710455213673413, "learning_rate": 3.5272222222222223e-06, "loss": 0.0001, "num_tokens": 1095854.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 3652 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.25, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 17.25, "completions/mean_terminated_length": 17.25, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 67.64814814814815, "frac_reward_zero_std": 0.0, "grad_norm": 2.7553746700286865, "kl": 0.006453685462474823, "learning_rate": 3.526666666666667e-06, "loss": -0.0744, "num_tokens": 1096151.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 3653 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 67.66666666666667, "frac_reward_zero_std": 1.0, "grad_norm": 0.0016875548753887415, "kl": 0.001779134792741388, "learning_rate": 3.5261111111111114e-06, "loss": 0.0001, "num_tokens": 1096431.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3654 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.25, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 14.25, "completions/mean_terminated_length": 14.25, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 67.68518518518519, "frac_reward_zero_std": 0.0, "grad_norm": 8.706485748291016, "kl": 0.1276140883564949, "learning_rate": 3.5255555555555558e-06, "loss": 0.0225, "num_tokens": 1096704.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 3655 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.25, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 4.25, "completions/mean_terminated_length": 4.25, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 67.70370370370371, "frac_reward_zero_std": 0.0, "grad_norm": 12.052241325378418, "kl": 1.117118757974822, "learning_rate": 3.525e-06, "loss": -0.1918, "num_tokens": 1096921.0, "reward": 3.875, "reward_std": 0.25, "rewards/reward_combined/mean": 3.875, "rewards/reward_combined/std": 0.25, "step": 3656 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.75, "completions/clipped_ratio": 0.0, "completions/max_length": 31.0, "completions/max_terminated_length": 31.0, "completions/mean_length": 24.75, "completions/mean_terminated_length": 24.75, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 67.72222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.1028142124414444, "kl": 0.06995443254709244, "learning_rate": 3.524444444444445e-06, "loss": 0.0035, "num_tokens": 1097248.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3657 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 67.74074074074075, "frac_reward_zero_std": 1.0, "grad_norm": 0.018355101346969604, "kl": 0.004877616884186864, "learning_rate": 3.523888888888889e-06, "loss": 0.0002, "num_tokens": 1097548.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3658 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.5, "completions/clipped_ratio": 0.0, "completions/max_length": 33.0, "completions/max_terminated_length": 33.0, "completions/mean_length": 17.5, "completions/mean_terminated_length": 17.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 67.75925925925925, "frac_reward_zero_std": 0.0, "grad_norm": 5.227734088897705, "kl": 0.017699570395052433, "learning_rate": 3.5233333333333336e-06, "loss": 0.2827, "num_tokens": 1097870.0, "reward": 5.75, "reward_std": 2.598076105117798, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 2.598076105117798, "step": 3659 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 67.77777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.14862921833992004, "kl": 0.01813464518636465, "learning_rate": 3.5227777777777784e-06, "loss": 0.0009, "num_tokens": 1098130.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3660 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 67.79629629629629, "frac_reward_zero_std": 1.0, "grad_norm": 0.13357220590114594, "kl": 0.033683608286082745, "learning_rate": 3.5222222222222223e-06, "loss": 0.0017, "num_tokens": 1098416.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3661 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 67.81481481481481, "frac_reward_zero_std": 1.0, "grad_norm": 0.3000745177268982, "kl": 0.04156101495027542, "learning_rate": 3.521666666666667e-06, "loss": 0.002, "num_tokens": 1098696.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3662 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 28.75, "completions/clipped_ratio": 0.0, "completions/max_length": 36.0, "completions/max_terminated_length": 36.0, "completions/mean_length": 28.75, "completions/mean_terminated_length": 28.75, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 67.83333333333333, "frac_reward_zero_std": 0.0, "grad_norm": 3.7239532470703125, "kl": 0.26988372951745987, "learning_rate": 3.5211111111111115e-06, "loss": 0.0563, "num_tokens": 1099063.0, "reward": 5.0, "reward_std": 3.316624879837036, "rewards/reward_combined/mean": 5.0, "rewards/reward_combined/std": 3.316624879837036, "step": 3663 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 67.85185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.01663338392972946, "kl": 0.0005131850484758615, "learning_rate": 3.520555555555556e-06, "loss": 0.0, "num_tokens": 1099333.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3664 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.25, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 4.25, "completions/mean_terminated_length": 4.25, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 67.87037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.049248822033405304, "kl": 0.0021760209929198027, "learning_rate": 3.52e-06, "loss": 0.0001, "num_tokens": 1099546.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 3665 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 67.88888888888889, "frac_reward_zero_std": 0.0, "grad_norm": 0.8920487761497498, "kl": 0.015725424513220787, "learning_rate": 3.519444444444445e-06, "loss": 0.0006, "num_tokens": 1099858.0, "reward": 2.75, "reward_std": 1.5, "rewards/reward_combined/mean": 2.75, "rewards/reward_combined/std": 1.5, "step": 3666 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 67.9074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.29580968618392944, "kl": 0.053568356786854565, "learning_rate": 3.518888888888889e-06, "loss": 0.0025, "num_tokens": 1100148.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3667 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.5, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 17.5, "completions/mean_terminated_length": 17.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 67.92592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.03546999394893646, "kl": 0.16726671159267426, "learning_rate": 3.5183333333333337e-06, "loss": 0.0084, "num_tokens": 1100458.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3668 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 41.75, "completions/clipped_ratio": 0.0, "completions/max_length": 44.0, "completions/max_terminated_length": 44.0, "completions/mean_length": 41.75, "completions/mean_terminated_length": 41.75, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 67.94444444444444, "frac_reward_zero_std": 0.0, "grad_norm": 0.8529254198074341, "kl": 0.1869199201464653, "learning_rate": 3.5177777777777784e-06, "loss": 0.0469, "num_tokens": 1100905.0, "reward": 2.799999952316284, "reward_std": 0.4000000059604645, "rewards/reward_combined/mean": 2.799999952316284, "rewards/reward_combined/std": 0.4000000059604645, "step": 3669 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 12.5, "completions/mean_terminated_length": 12.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 67.96296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 1.028513789176941, "kl": 0.1925788652151823, "learning_rate": 3.5172222222222224e-06, "loss": 0.0101, "num_tokens": 1101179.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3670 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.5, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 16.5, "completions/mean_terminated_length": 16.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 67.98148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.08000446110963821, "kl": 0.015472561120986938, "learning_rate": 3.516666666666667e-06, "loss": 0.0009, "num_tokens": 1101505.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3671 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 28.5, "completions/clipped_ratio": 0.0, "completions/max_length": 49.0, "completions/max_terminated_length": 49.0, "completions/mean_length": 28.5, "completions/mean_terminated_length": 28.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 68.0, "frac_reward_zero_std": 0.0, "grad_norm": 3.8445615768432617, "kl": 0.02216933947056532, "learning_rate": 3.516111111111111e-06, "loss": -0.232, "num_tokens": 1101839.0, "reward": 7.625, "reward_std": 0.25, "rewards/reward_combined/mean": 7.625, "rewards/reward_combined/std": 0.25, "step": 3672 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.5, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.5, "completions/mean_terminated_length": 6.5, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 68.01851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.38906967639923096, "kl": 0.05073143634945154, "learning_rate": 3.515555555555556e-06, "loss": 0.0024, "num_tokens": 1102073.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 3673 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 27.5, "completions/clipped_ratio": 0.0, "completions/max_length": 63.0, "completions/max_terminated_length": 63.0, "completions/mean_length": 27.5, "completions/mean_terminated_length": 27.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 68.03703703703704, "frac_reward_zero_std": 0.0, "grad_norm": 6.016326427459717, "kl": 0.09802917065098882, "learning_rate": 3.5150000000000002e-06, "loss": 0.3443, "num_tokens": 1102403.0, "reward": 5.875, "reward_std": 3.25, "rewards/reward_combined/mean": 5.875, "rewards/reward_combined/std": 3.25, "step": 3674 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.5, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 22.5, "completions/mean_terminated_length": 22.5, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 68.05555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.1252201348543167, "kl": 0.023190381936728954, "learning_rate": 3.5144444444444446e-06, "loss": 0.0012, "num_tokens": 1102721.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3675 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.5, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 16.5, "completions/mean_terminated_length": 16.5, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 68.07407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.14260946214199066, "kl": 0.04828420467674732, "learning_rate": 3.513888888888889e-06, "loss": 0.0024, "num_tokens": 1103003.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3676 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.0, "completions/clipped_ratio": 0.0, "completions/max_length": 30.0, "completions/max_terminated_length": 30.0, "completions/mean_length": 24.0, "completions/mean_terminated_length": 24.0, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 68.0925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.08059034496545792, "kl": 0.018926447723060846, "learning_rate": 3.5133333333333337e-06, "loss": 0.001, "num_tokens": 1103343.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3677 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.25, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.25, "completions/mean_terminated_length": 13.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 68.11111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.020689386874437332, "kl": 0.0035027912817895412, "learning_rate": 3.5127777777777785e-06, "loss": 0.0002, "num_tokens": 1103620.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3678 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 68.12962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.013757583685219288, "kl": 0.013585182838141918, "learning_rate": 3.5122222222222224e-06, "loss": 0.0007, "num_tokens": 1103932.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 3679 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 68.14814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.10765332728624344, "kl": 0.025122812949121, "learning_rate": 3.511666666666667e-06, "loss": 0.0013, "num_tokens": 1104232.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3680 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.25, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 10.25, "completions/mean_terminated_length": 10.25, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 68.16666666666667, "frac_reward_zero_std": 1.0, "grad_norm": 0.03786454722285271, "kl": 0.0037151454962440766, "learning_rate": 3.511111111111111e-06, "loss": 0.0002, "num_tokens": 1104489.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 3681 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 68.18518518518519, "frac_reward_zero_std": 0.0, "grad_norm": 4.409121990203857, "kl": 0.007023252546787262, "learning_rate": 3.510555555555556e-06, "loss": 0.1003, "num_tokens": 1104753.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 3682 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 68.20370370370371, "frac_reward_zero_std": 1.0, "grad_norm": 0.03461194038391113, "kl": 0.0022564732644241303, "learning_rate": 3.5100000000000003e-06, "loss": 0.0001, "num_tokens": 1105031.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3683 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 68.22222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.09251701831817627, "kl": 0.01447087386623025, "learning_rate": 3.5094444444444446e-06, "loss": 0.0007, "num_tokens": 1105299.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3684 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 68.24074074074075, "frac_reward_zero_std": 1.0, "grad_norm": 0.00015434762462973595, "kl": 1.5445053577423096e-05, "learning_rate": 3.508888888888889e-06, "loss": 0.0, "num_tokens": 1105519.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 3685 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 10.5, "completions/mean_terminated_length": 10.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 68.25925925925925, "frac_reward_zero_std": 1.0, "grad_norm": 0.017901211977005005, "kl": 0.0005703210917999968, "learning_rate": 3.5083333333333338e-06, "loss": 0.0, "num_tokens": 1105785.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3686 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.25, "completions/clipped_ratio": 0.0, "completions/max_length": 27.0, "completions/max_terminated_length": 27.0, "completions/mean_length": 18.25, "completions/mean_terminated_length": 18.25, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 68.27777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.04937196150422096, "kl": 0.2099093645811081, "learning_rate": 3.507777777777778e-06, "loss": 0.01, "num_tokens": 1106098.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3687 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.5, "completions/clipped_ratio": 0.0, "completions/max_length": 35.0, "completions/max_terminated_length": 35.0, "completions/mean_length": 32.5, "completions/mean_terminated_length": 32.5, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 68.29629629629629, "frac_reward_zero_std": 0.0, "grad_norm": 3.694841146469116, "kl": 0.12012674659490585, "learning_rate": 3.5072222222222225e-06, "loss": -0.032, "num_tokens": 1106452.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 3688 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 27.5, "completions/clipped_ratio": 0.0, "completions/max_length": 48.0, "completions/max_terminated_length": 48.0, "completions/mean_length": 27.5, "completions/mean_terminated_length": 27.5, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 68.31481481481481, "frac_reward_zero_std": 0.0, "grad_norm": 2.969334840774536, "kl": 0.31573934527114034, "learning_rate": 3.5066666666666673e-06, "loss": 0.0158, "num_tokens": 1106782.0, "reward": 4.0, "reward_std": 4.041451930999756, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 4.041451930999756, "step": 3689 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.75, "completions/clipped_ratio": 0.0, "completions/max_length": 28.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 23.75, "completions/mean_terminated_length": 23.75, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 68.33333333333333, "frac_reward_zero_std": 1.0, "grad_norm": 0.055963389575481415, "kl": 0.022963601164519787, "learning_rate": 3.506111111111111e-06, "loss": 0.0012, "num_tokens": 1107097.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3690 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 68.35185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.09798284620046616, "kl": 0.044139228761196136, "learning_rate": 3.505555555555556e-06, "loss": 0.0023, "num_tokens": 1107369.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3691 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 68.37037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.03948074206709862, "kl": 0.004211842082440853, "learning_rate": 3.505e-06, "loss": 0.0002, "num_tokens": 1107635.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3692 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.75, "completions/clipped_ratio": 0.0, "completions/max_length": 32.0, "completions/max_terminated_length": 32.0, "completions/mean_length": 25.75, "completions/mean_terminated_length": 25.75, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 68.38888888888889, "frac_reward_zero_std": 0.0, "grad_norm": 3.1148009300231934, "kl": 0.0701107457280159, "learning_rate": 3.5044444444444447e-06, "loss": 0.0215, "num_tokens": 1107986.0, "reward": 3.875, "reward_std": 0.25, "rewards/reward_combined/mean": 3.875, "rewards/reward_combined/std": 0.25, "step": 3693 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.75, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.75, "completions/mean_terminated_length": 14.75, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 68.4074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.006807222962379456, "kl": 0.0010594595805741847, "learning_rate": 3.5038888888888895e-06, "loss": 0.0001, "num_tokens": 1108305.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3694 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 68.42592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.18931210041046143, "kl": 0.03402386233210564, "learning_rate": 3.5033333333333334e-06, "loss": 0.0014, "num_tokens": 1108565.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 3695 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 68.44444444444444, "frac_reward_zero_std": 0.0, "grad_norm": 5.891815185546875, "kl": 0.03670062869787216, "learning_rate": 3.502777777777778e-06, "loss": 0.0497, "num_tokens": 1108861.0, "reward": 5.75, "reward_std": 4.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 4.5, "step": 3696 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 36.75, "completions/clipped_ratio": 0.0, "completions/max_length": 40.0, "completions/max_terminated_length": 40.0, "completions/mean_length": 36.75, "completions/mean_terminated_length": 36.75, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 68.46296296296296, "frac_reward_zero_std": 0.0, "grad_norm": 1.8799819946289062, "kl": 0.0851309671998024, "learning_rate": 3.5022222222222225e-06, "loss": 0.0239, "num_tokens": 1109236.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 3697 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.25, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 9.25, "completions/mean_terminated_length": 9.25, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 68.48148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.019286956638097763, "kl": 0.01259859511628747, "learning_rate": 3.5016666666666673e-06, "loss": 0.0006, "num_tokens": 1109497.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3698 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.25, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 3.25, "completions/mean_terminated_length": 3.25, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 68.5, "frac_reward_zero_std": 1.0, "grad_norm": 0.07569728046655655, "kl": 0.004416240844875574, "learning_rate": 3.5011111111111112e-06, "loss": 0.0002, "num_tokens": 1109706.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 3699 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 68.51851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.06637779623270035, "kl": 0.006878424566821195, "learning_rate": 3.500555555555556e-06, "loss": 0.0003, "num_tokens": 1109996.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3700 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.25, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 8.25, "completions/mean_terminated_length": 8.25, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 68.53703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.6083786487579346, "kl": 0.08128560334444046, "learning_rate": 3.5e-06, "loss": 0.0043, "num_tokens": 1110241.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 3701 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 43.75, "completions/clipped_ratio": 0.0, "completions/max_length": 45.0, "completions/max_terminated_length": 45.0, "completions/mean_length": 43.75, "completions/mean_terminated_length": 43.75, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 68.55555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.02567235380411148, "kl": 0.035078201442956924, "learning_rate": 3.4994444444444447e-06, "loss": 0.0018, "num_tokens": 1110696.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 3702 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.25, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 17.25, "completions/mean_terminated_length": 17.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 68.57407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.023987192660570145, "kl": 0.1676325723528862, "learning_rate": 3.4988888888888895e-06, "loss": 0.0084, "num_tokens": 1111005.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3703 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.25, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 20.25, "completions/mean_terminated_length": 20.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 68.5925925925926, "frac_reward_zero_std": 0.0, "grad_norm": 2.96281361579895, "kl": 0.20731389708817005, "learning_rate": 3.4983333333333334e-06, "loss": 0.0811, "num_tokens": 1111338.0, "reward": 5.0, "reward_std": 3.316624879837036, "rewards/reward_combined/mean": 5.0, "rewards/reward_combined/std": 3.316624879837036, "step": 3704 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 68.61111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.05134955421090126, "kl": 0.01437505753710866, "learning_rate": 3.4977777777777782e-06, "loss": 0.0007, "num_tokens": 1111622.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3705 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.25, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 8.25, "completions/mean_terminated_length": 8.25, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 68.62962962962963, "frac_reward_zero_std": 0.0, "grad_norm": 13.265174865722656, "kl": 0.053065018728375435, "learning_rate": 3.4972222222222226e-06, "loss": 0.1362, "num_tokens": 1111855.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 3706 }, { "clip_ratio/high_max": 0.011111111380159855, "clip_ratio/high_mean": 0.011111111380159855, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.011111111380159855, "completion_length": 22.5, "completions/clipped_ratio": 0.0, "completions/max_length": 25.0, "completions/max_terminated_length": 25.0, "completions/mean_length": 22.5, "completions/mean_terminated_length": 22.5, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 68.64814814814815, "frac_reward_zero_std": 0.0, "grad_norm": 4.155365943908691, "kl": 0.1507706269621849, "learning_rate": 3.496666666666667e-06, "loss": 0.0541, "num_tokens": 1112165.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 3707 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.5, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 19.5, "completions/mean_terminated_length": 19.5, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 68.66666666666667, "frac_reward_zero_std": 1.0, "grad_norm": 0.011713210493326187, "kl": 0.011179805267602205, "learning_rate": 3.4961111111111113e-06, "loss": 0.0006, "num_tokens": 1112471.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3708 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.75, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 4.75, "completions/mean_terminated_length": 4.75, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 68.68518518518519, "frac_reward_zero_std": 1.0, "grad_norm": 0.14152024686336517, "kl": 0.010915258550085127, "learning_rate": 3.495555555555556e-06, "loss": 0.0005, "num_tokens": 1112690.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 3709 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 31.25, "completions/clipped_ratio": 0.0, "completions/max_length": 44.0, "completions/max_terminated_length": 44.0, "completions/mean_length": 31.25, "completions/mean_terminated_length": 31.25, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 68.70370370370371, "frac_reward_zero_std": 0.0, "grad_norm": 2.4282476902008057, "kl": 0.07843845151364803, "learning_rate": 3.495e-06, "loss": 0.043, "num_tokens": 1113039.0, "reward": 4.5, "reward_std": 2.0, "rewards/reward_combined/mean": 4.5, "rewards/reward_combined/std": 2.0, "step": 3710 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.75, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 19.75, "completions/mean_terminated_length": 19.75, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 68.72222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.04414372518658638, "kl": 0.009822174906730652, "learning_rate": 3.4944444444444448e-06, "loss": 0.0006, "num_tokens": 1113358.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3711 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 68.74074074074075, "frac_reward_zero_std": 1.0, "grad_norm": 0.05426568165421486, "kl": 0.002056846977211535, "learning_rate": 3.4938888888888896e-06, "loss": 0.0001, "num_tokens": 1113574.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 3712 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.25, "completions/clipped_ratio": 0.0, "completions/max_length": 29.0, "completions/max_terminated_length": 29.0, "completions/mean_length": 22.25, "completions/mean_terminated_length": 22.25, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 68.75925925925925, "frac_reward_zero_std": 0.0, "grad_norm": 7.4998884201049805, "kl": 0.7186867091804743, "learning_rate": 3.4933333333333335e-06, "loss": 0.1322, "num_tokens": 1113915.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 3713 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 31.75, "completions/clipped_ratio": 0.0, "completions/max_length": 49.0, "completions/max_terminated_length": 49.0, "completions/mean_length": 31.75, "completions/mean_terminated_length": 31.75, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 68.77777777777777, "frac_reward_zero_std": 0.0, "grad_norm": 2.8977017402648926, "kl": 0.10615283995866776, "learning_rate": 3.4927777777777783e-06, "loss": 0.1389, "num_tokens": 1114278.0, "reward": 4.125, "reward_std": 3.902456521987915, "rewards/reward_combined/mean": 4.125, "rewards/reward_combined/std": 3.902456521987915, "step": 3714 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 68.79629629629629, "frac_reward_zero_std": 1.0, "grad_norm": 0.0685388520359993, "kl": 0.01687341369688511, "learning_rate": 3.492222222222222e-06, "loss": 0.0008, "num_tokens": 1114562.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3715 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 68.81481481481481, "frac_reward_zero_std": 1.0, "grad_norm": 0.017143942415714264, "kl": 0.009853059891611338, "learning_rate": 3.491666666666667e-06, "loss": 0.0005, "num_tokens": 1114834.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3716 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 68.83333333333333, "frac_reward_zero_std": 1.0, "grad_norm": 0.009982370771467686, "kl": 0.009159393608570099, "learning_rate": 3.4911111111111113e-06, "loss": 0.0005, "num_tokens": 1115070.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 3717 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 29.25, "completions/clipped_ratio": 0.0, "completions/max_length": 32.0, "completions/max_terminated_length": 32.0, "completions/mean_length": 29.25, "completions/mean_terminated_length": 29.25, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 68.85185185185185, "frac_reward_zero_std": 0.0, "grad_norm": 2.4144060611724854, "kl": 0.3329150974750519, "learning_rate": 3.490555555555556e-06, "loss": 0.0453, "num_tokens": 1115403.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 3718 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 68.87037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.01669352874159813, "kl": 0.004600953310728073, "learning_rate": 3.49e-06, "loss": 0.0002, "num_tokens": 1115707.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3719 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.0, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 24.0, "completions/mean_terminated_length": 24.0, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 68.88888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.00170939601957798, "kl": 0.09045297279953957, "learning_rate": 3.489444444444445e-06, "loss": 0.0045, "num_tokens": 1116071.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 3720 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.25, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 4.25, "completions/mean_terminated_length": 4.25, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 68.9074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.004140973556786776, "kl": 7.459372136509046e-05, "learning_rate": 3.4888888888888896e-06, "loss": 0.0, "num_tokens": 1116284.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 3721 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 68.92592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.21304234862327576, "kl": 0.030882075428962708, "learning_rate": 3.4883333333333335e-06, "loss": 0.0015, "num_tokens": 1116568.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3722 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.0, "completions/clipped_ratio": 0.0, "completions/max_length": 33.0, "completions/max_terminated_length": 33.0, "completions/mean_length": 25.0, "completions/mean_terminated_length": 25.0, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 68.94444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.19098307192325592, "kl": 0.0883914865553379, "learning_rate": 3.4877777777777783e-06, "loss": 0.0043, "num_tokens": 1116896.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3723 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 68.96296296296296, "frac_reward_zero_std": 0.0, "grad_norm": 4.058191776275635, "kl": 0.18757027387619019, "learning_rate": 3.4872222222222222e-06, "loss": 0.1798, "num_tokens": 1117164.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 3724 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 72.75, "completions/clipped_ratio": 0.25, "completions/max_length": 256.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 72.75, "completions/mean_terminated_length": 11.666666984558105, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 68.98148148148148, "frac_reward_zero_std": 0.0, "grad_norm": 5.745594024658203, "kl": 0.05983216827735305, "learning_rate": 3.486666666666667e-06, "loss": 0.2525, "num_tokens": 1117675.0, "reward": 5.875, "reward_std": 2.136000871658325, "rewards/reward_combined/mean": 5.875, "rewards/reward_combined/std": 2.136000871658325, "step": 3725 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.0, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 21.0, "completions/mean_terminated_length": 21.0, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 69.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.06191493943333626, "kl": 0.02030244469642639, "learning_rate": 3.4861111111111114e-06, "loss": 0.001, "num_tokens": 1118007.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3726 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 69.01851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 1.0511596202850342, "kl": 0.15838118735700846, "learning_rate": 3.4855555555555557e-06, "loss": 0.0086, "num_tokens": 1118281.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3727 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 10.5, "completions/mean_terminated_length": 10.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 69.03703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.15896835923194885, "kl": 0.025477363727986813, "learning_rate": 3.485e-06, "loss": 0.0013, "num_tokens": 1118543.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3728 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 69.05555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.041578516364097595, "kl": 0.00923643633723259, "learning_rate": 3.484444444444445e-06, "loss": 0.0005, "num_tokens": 1118841.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3729 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 69.07407407407408, "frac_reward_zero_std": 0.0, "grad_norm": 4.413516044616699, "kl": 0.13488477282226086, "learning_rate": 3.4838888888888892e-06, "loss": -0.0308, "num_tokens": 1119119.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 3730 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 30.0, "completions/clipped_ratio": 0.0, "completions/max_length": 34.0, "completions/max_terminated_length": 34.0, "completions/mean_length": 30.0, "completions/mean_terminated_length": 30.0, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 69.0925925925926, "frac_reward_zero_std": 0.0, "grad_norm": 1.4531584978103638, "kl": 0.06753463298082352, "learning_rate": 3.4833333333333336e-06, "loss": 0.0195, "num_tokens": 1119475.0, "reward": 6.5, "reward_std": 2.0, "rewards/reward_combined/mean": 6.5, "rewards/reward_combined/std": 2.0, "step": 3731 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.0, "completions/clipped_ratio": 0.0, "completions/max_length": 30.0, "completions/max_terminated_length": 30.0, "completions/mean_length": 21.0, "completions/mean_terminated_length": 21.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 69.11111111111111, "frac_reward_zero_std": 0.0, "grad_norm": 2.5979576110839844, "kl": 0.037075428292155266, "learning_rate": 3.4827777777777784e-06, "loss": 0.1006, "num_tokens": 1119779.0, "reward": 5.875, "reward_std": 3.5910768508911133, "rewards/reward_combined/mean": 5.875, "rewards/reward_combined/std": 3.5910770893096924, "step": 3732 }, { "clip_ratio/high_max": 0.02380952425301075, "clip_ratio/high_mean": 0.02380952425301075, "clip_ratio/low_mean": 0.017543859779834747, "clip_ratio/low_min": 0.017543859779834747, "clip_ratio/region_mean": 0.0413533840328455, "completion_length": 24.75, "completions/clipped_ratio": 0.0, "completions/max_length": 39.0, "completions/max_terminated_length": 39.0, "completions/mean_length": 24.75, "completions/mean_terminated_length": 24.75, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 69.12962962962963, "frac_reward_zero_std": 0.0, "grad_norm": 3.2831544876098633, "kl": 0.06555721908807755, "learning_rate": 3.4822222222222223e-06, "loss": 0.0596, "num_tokens": 1120106.0, "reward": 5.125, "reward_std": 3.4731109142303467, "rewards/reward_combined/mean": 5.125, "rewards/reward_combined/std": 3.4731109142303467, "step": 3733 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 38.75, "completions/clipped_ratio": 0.0, "completions/max_length": 46.0, "completions/max_terminated_length": 46.0, "completions/mean_length": 38.75, "completions/mean_terminated_length": 38.75, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 69.14814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.4112396538257599, "kl": 0.10254883393645287, "learning_rate": 3.481666666666667e-06, "loss": 0.0053, "num_tokens": 1120485.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3734 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 90.25, "completions/clipped_ratio": 0.25, "completions/max_length": 256.0, "completions/max_terminated_length": 48.0, "completions/mean_length": 90.25, "completions/mean_terminated_length": 35.0, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 69.16666666666667, "frac_reward_zero_std": 0.0, "grad_norm": 1.4116404056549072, "kl": 0.034873390570282936, "learning_rate": 3.4811111111111114e-06, "loss": 0.4302, "num_tokens": 1121066.0, "reward": 5.050000190734863, "reward_std": 4.900000095367432, "rewards/reward_combined/mean": 5.050000190734863, "rewards/reward_combined/std": 4.900000095367432, "step": 3735 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 69.18518518518519, "frac_reward_zero_std": 1.0, "grad_norm": 0.06162925809621811, "kl": 0.006070858595194295, "learning_rate": 3.4805555555555558e-06, "loss": 0.0003, "num_tokens": 1121354.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3736 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.25, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 18.25, "completions/mean_terminated_length": 18.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 69.20370370370371, "frac_reward_zero_std": 0.0, "grad_norm": 4.5916008949279785, "kl": 0.18640215694904327, "learning_rate": 3.48e-06, "loss": 0.0623, "num_tokens": 1121667.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 3737 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.5, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 20.5, "completions/mean_terminated_length": 20.5, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 69.22222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.032520048320293427, "kl": 0.011046601925045252, "learning_rate": 3.479444444444445e-06, "loss": 0.0005, "num_tokens": 1122001.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3738 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 69.24074074074075, "frac_reward_zero_std": 1.0, "grad_norm": 0.009732295759022236, "kl": 0.008661193773150444, "learning_rate": 3.4788888888888893e-06, "loss": 0.0004, "num_tokens": 1122283.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3739 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 69.25925925925925, "frac_reward_zero_std": 1.0, "grad_norm": 0.003264730330556631, "kl": 0.0027622650377452374, "learning_rate": 3.4783333333333336e-06, "loss": 0.0001, "num_tokens": 1122567.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3740 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.75, "completions/clipped_ratio": 0.0, "completions/max_length": 30.0, "completions/max_terminated_length": 30.0, "completions/mean_length": 24.75, "completions/mean_terminated_length": 24.75, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 69.27777777777777, "frac_reward_zero_std": 0.0, "grad_norm": 2.6151320934295654, "kl": 0.05862594395875931, "learning_rate": 3.4777777777777784e-06, "loss": -0.0231, "num_tokens": 1122914.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 3741 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 69.29629629629629, "frac_reward_zero_std": 1.0, "grad_norm": 0.00676839891821146, "kl": 0.0007748156785964966, "learning_rate": 3.4772222222222223e-06, "loss": 0.0, "num_tokens": 1123126.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 3742 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.5, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 9.5, "completions/mean_terminated_length": 9.5, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 69.31481481481481, "frac_reward_zero_std": 1.0, "grad_norm": 0.14126138389110565, "kl": 0.014181540347635746, "learning_rate": 3.476666666666667e-06, "loss": 0.0008, "num_tokens": 1123392.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 3743 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.5, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 3.5, "completions/mean_terminated_length": 3.5, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 69.33333333333333, "frac_reward_zero_std": 1.0, "grad_norm": 0.044669222086668015, "kl": 0.0027658119797706604, "learning_rate": 3.476111111111111e-06, "loss": 0.0001, "num_tokens": 1123602.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 3744 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 69.35185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.05537993833422661, "kl": 0.03703347593545914, "learning_rate": 3.475555555555556e-06, "loss": 0.0019, "num_tokens": 1123894.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3745 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 69.37037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.008672441355884075, "kl": 0.01651405543088913, "learning_rate": 3.475e-06, "loss": 0.0008, "num_tokens": 1124154.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3746 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.0, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 24.0, "completions/mean_terminated_length": 24.0, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 69.38888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.0014311624690890312, "kl": 0.09060052782297134, "learning_rate": 3.4744444444444445e-06, "loss": 0.0045, "num_tokens": 1124518.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 3747 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.25, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.25, "completions/mean_terminated_length": 14.25, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 69.4074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.013111861422657967, "kl": 0.001251677516847849, "learning_rate": 3.4738888888888893e-06, "loss": 0.0001, "num_tokens": 1124835.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3748 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 5.75, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 5.75, "completions/mean_terminated_length": 5.75, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 69.42592592592592, "frac_reward_zero_std": 0.0, "grad_norm": 8.789616584777832, "kl": 0.1354963816702366, "learning_rate": 3.4733333333333337e-06, "loss": 0.1311, "num_tokens": 1125078.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 3749 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 69.44444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.03910009562969208, "kl": 0.011721659451723099, "learning_rate": 3.4727777777777785e-06, "loss": 0.0006, "num_tokens": 1125390.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 3750 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.0, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 20.0, "completions/mean_terminated_length": 20.0, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 69.46296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.03089405782520771, "kl": 0.009803989436477423, "learning_rate": 3.4722222222222224e-06, "loss": 0.0005, "num_tokens": 1125698.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3751 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.5, "completions/clipped_ratio": 0.0, "completions/max_length": 31.0, "completions/max_terminated_length": 31.0, "completions/mean_length": 19.5, "completions/mean_terminated_length": 19.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 69.48148148148148, "frac_reward_zero_std": 0.0, "grad_norm": 4.315831661224365, "kl": 0.041498917154967785, "learning_rate": 3.471666666666667e-06, "loss": 0.1185, "num_tokens": 1125988.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 3752 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 41.75, "completions/clipped_ratio": 0.0, "completions/max_length": 44.0, "completions/max_terminated_length": 44.0, "completions/mean_length": 41.75, "completions/mean_terminated_length": 41.75, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "epoch": 69.5, "frac_reward_zero_std": 1.0, "grad_norm": 0.03474978357553482, "kl": 0.04647521674633026, "learning_rate": 3.471111111111111e-06, "loss": 0.0023, "num_tokens": 1126435.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 3753 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 36.5, "completions/clipped_ratio": 0.0, "completions/max_length": 40.0, "completions/max_terminated_length": 40.0, "completions/mean_length": 36.5, "completions/mean_terminated_length": 36.5, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 69.51851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.07999103516340256, "kl": 0.0793612077832222, "learning_rate": 3.470555555555556e-06, "loss": 0.004, "num_tokens": 1126809.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3754 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 69.53703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.029926080256700516, "kl": 0.0008181408047676086, "learning_rate": 3.4700000000000002e-06, "loss": 0.0, "num_tokens": 1127021.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 3755 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.5, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 9.5, "completions/mean_terminated_length": 9.5, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 69.55555555555556, "frac_reward_zero_std": 0.0, "grad_norm": 5.334052085876465, "kl": 0.06425761803984642, "learning_rate": 3.4694444444444446e-06, "loss": 0.2684, "num_tokens": 1127259.0, "reward": 2.75, "reward_std": 2.5, "rewards/reward_combined/mean": 2.75, "rewards/reward_combined/std": 2.5, "step": 3756 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.0, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 20.0, "completions/mean_terminated_length": 20.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 69.57407407407408, "frac_reward_zero_std": 0.0, "grad_norm": 1.4420582056045532, "kl": 0.04377244645729661, "learning_rate": 3.4688888888888894e-06, "loss": 0.0693, "num_tokens": 1127587.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 3757 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 69.5925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.1111113429069519, "kl": 0.05746060982346535, "learning_rate": 3.4683333333333337e-06, "loss": 0.0031, "num_tokens": 1127863.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3758 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.5, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 19.5, "completions/mean_terminated_length": 19.5, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 69.61111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.03136065602302551, "kl": 0.0025132453883998096, "learning_rate": 3.467777777777778e-06, "loss": 0.0001, "num_tokens": 1128181.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3759 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 69.62962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.04715379327535629, "kl": 0.01895292429253459, "learning_rate": 3.4672222222222224e-06, "loss": 0.0009, "num_tokens": 1128471.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3760 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 69.64814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.006311085075139999, "kl": 0.0002531962818466127, "learning_rate": 3.4666666666666672e-06, "loss": 0.0, "num_tokens": 1128751.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3761 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 69.66666666666667, "frac_reward_zero_std": 1.0, "grad_norm": 0.01902838423848152, "kl": 0.0053176876390352845, "learning_rate": 3.466111111111111e-06, "loss": 0.0003, "num_tokens": 1129055.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3762 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 5.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 5.0, "completions/mean_terminated_length": 5.0, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 69.68518518518519, "frac_reward_zero_std": 1.0, "grad_norm": 0.005079538561403751, "kl": 0.001172947813756764, "learning_rate": 3.465555555555556e-06, "loss": 0.0001, "num_tokens": 1129275.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 3763 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.5, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 8.5, "completions/mean_terminated_length": 8.5, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 69.70370370370371, "frac_reward_zero_std": 1.0, "grad_norm": 0.06676094979047775, "kl": 0.007788576185703278, "learning_rate": 3.465e-06, "loss": 0.0005, "num_tokens": 1129529.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 3764 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 69.72222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.03098052367568016, "kl": 0.0031139015191001818, "learning_rate": 3.4644444444444446e-06, "loss": 0.0002, "num_tokens": 1129785.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 3765 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.75, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 11.75, "completions/mean_terminated_length": 11.75, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 69.74074074074075, "frac_reward_zero_std": 0.0, "grad_norm": 2.4245657920837402, "kl": 0.010215928312391043, "learning_rate": 3.4638888888888894e-06, "loss": 0.0524, "num_tokens": 1130056.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 3766 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 69.75925925925925, "frac_reward_zero_std": 1.0, "grad_norm": 0.00010693544027162716, "kl": 9.052455425262451e-06, "learning_rate": 3.4633333333333333e-06, "loss": 0.0, "num_tokens": 1130276.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 3767 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 69.77777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.011909924447536469, "kl": 0.2141888290643692, "learning_rate": 3.462777777777778e-06, "loss": 0.0107, "num_tokens": 1130580.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3768 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 69.79629629629629, "frac_reward_zero_std": 1.0, "grad_norm": 0.30587613582611084, "kl": 0.06303915288299322, "learning_rate": 3.4622222222222225e-06, "loss": 0.0032, "num_tokens": 1130832.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 3769 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 69.81481481481481, "frac_reward_zero_std": 1.0, "grad_norm": 0.010042497888207436, "kl": 0.00022950422135181725, "learning_rate": 3.4616666666666673e-06, "loss": 0.0, "num_tokens": 1131102.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3770 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.0, "completions/clipped_ratio": 0.0, "completions/max_length": 28.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 24.0, "completions/mean_terminated_length": 24.0, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 69.83333333333333, "frac_reward_zero_std": 0.0, "grad_norm": 2.1514289379119873, "kl": 0.06050388514995575, "learning_rate": 3.461111111111111e-06, "loss": -0.0562, "num_tokens": 1131422.0, "reward": 7.625, "reward_std": 0.25, "rewards/reward_combined/mean": 7.625, "rewards/reward_combined/std": 0.25, "step": 3771 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 28.5, "completions/clipped_ratio": 0.0, "completions/max_length": 30.0, "completions/max_terminated_length": 30.0, "completions/mean_length": 28.5, "completions/mean_terminated_length": 28.5, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 69.85185185185185, "frac_reward_zero_std": 0.0, "grad_norm": 3.4321258068084717, "kl": 0.9023962318897247, "learning_rate": 3.460555555555556e-06, "loss": 0.0381, "num_tokens": 1131752.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 3772 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.5, "completions/clipped_ratio": 0.0, "completions/max_length": 31.0, "completions/max_terminated_length": 31.0, "completions/mean_length": 21.5, "completions/mean_terminated_length": 21.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 69.87037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 2.875001907348633, "kl": 0.32970716059207916, "learning_rate": 3.46e-06, "loss": 0.0151, "num_tokens": 1132058.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 3773 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 26.75, "completions/clipped_ratio": 0.0, "completions/max_length": 41.0, "completions/max_terminated_length": 41.0, "completions/mean_length": 26.75, "completions/mean_terminated_length": 26.75, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 69.88888888888889, "frac_reward_zero_std": 0.0, "grad_norm": 3.5054779052734375, "kl": 0.0242116404697299, "learning_rate": 3.4594444444444447e-06, "loss": 0.1542, "num_tokens": 1132393.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 3774 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.0, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 22.0, "completions/mean_terminated_length": 22.0, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 69.9074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.16388331353664398, "kl": 0.02547937212511897, "learning_rate": 3.4588888888888895e-06, "loss": 0.0014, "num_tokens": 1132725.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3775 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.5, "completions/clipped_ratio": 0.0, "completions/max_length": 27.0, "completions/max_terminated_length": 27.0, "completions/mean_length": 24.5, "completions/mean_terminated_length": 24.5, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 69.92592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.32951870560646057, "kl": 0.05503563955426216, "learning_rate": 3.4583333333333334e-06, "loss": 0.0028, "num_tokens": 1133043.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3776 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.5, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.5, "completions/mean_terminated_length": 6.5, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 69.94444444444444, "frac_reward_zero_std": 0.0, "grad_norm": 5.294356346130371, "kl": 0.06782082468271255, "learning_rate": 3.457777777777778e-06, "loss": 0.0478, "num_tokens": 1133277.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 3777 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.25, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.25, "completions/mean_terminated_length": 13.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 69.96296296296296, "frac_reward_zero_std": 0.0, "grad_norm": 4.4755425453186035, "kl": 1.4691326401662081, "learning_rate": 3.4572222222222225e-06, "loss": 0.0737, "num_tokens": 1133554.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 3778 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.25, "completions/clipped_ratio": 0.0, "completions/max_length": 29.0, "completions/max_terminated_length": 29.0, "completions/mean_length": 25.25, "completions/mean_terminated_length": 25.25, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 69.98148148148148, "frac_reward_zero_std": 0.0, "grad_norm": 3.2152209281921387, "kl": 0.07385629322379827, "learning_rate": 3.456666666666667e-06, "loss": 0.0392, "num_tokens": 1133907.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 3779 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 70.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.01082912553101778, "kl": 0.0031849287915974855, "learning_rate": 3.4561111111111112e-06, "loss": 0.0002, "num_tokens": 1134163.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3780 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.25, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.25, "completions/mean_terminated_length": 13.25, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 70.01851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.0635760948061943, "kl": 0.005294523463817313, "learning_rate": 3.455555555555556e-06, "loss": 0.0003, "num_tokens": 1134440.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3781 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.75, "completions/clipped_ratio": 0.0, "completions/max_length": 33.0, "completions/max_terminated_length": 33.0, "completions/mean_length": 24.75, "completions/mean_terminated_length": 24.75, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 70.03703703703704, "frac_reward_zero_std": 0.0, "grad_norm": 4.76171875, "kl": 0.14856815338134766, "learning_rate": 3.455e-06, "loss": 0.1083, "num_tokens": 1134763.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 3782 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 44.0, "completions/clipped_ratio": 0.0, "completions/max_length": 44.0, "completions/max_terminated_length": 44.0, "completions/mean_length": 44.0, "completions/mean_terminated_length": 44.0, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "epoch": 70.05555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.030287928879261017, "kl": 0.042453017085790634, "learning_rate": 3.4544444444444447e-06, "loss": 0.0021, "num_tokens": 1135219.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 3783 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 70.07407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.00833990890532732, "kl": 0.0021722452947869897, "learning_rate": 3.4538888888888895e-06, "loss": 0.0001, "num_tokens": 1135515.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3784 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 33.5, "completions/clipped_ratio": 0.0, "completions/max_length": 86.0, "completions/max_terminated_length": 86.0, "completions/mean_length": 33.5, "completions/mean_terminated_length": 33.5, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 70.0925925925926, "frac_reward_zero_std": 0.0, "grad_norm": 3.2565507888793945, "kl": 0.1806437000632286, "learning_rate": 3.4533333333333334e-06, "loss": 0.3509, "num_tokens": 1135889.0, "reward": 7.875, "reward_std": 0.25, "rewards/reward_combined/mean": 7.875, "rewards/reward_combined/std": 0.25, "step": 3785 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.25, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 23.25, "completions/mean_terminated_length": 23.25, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 70.11111111111111, "frac_reward_zero_std": 0.0, "grad_norm": 1.7328623533248901, "kl": 0.3757609431631863, "learning_rate": 3.4527777777777782e-06, "loss": 0.01, "num_tokens": 1136210.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 3786 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 70.12962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.08268975466489792, "kl": 0.019754558801651, "learning_rate": 3.452222222222222e-06, "loss": 0.001, "num_tokens": 1136482.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3787 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 70.14814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.316976398229599, "kl": 0.028150849044322968, "learning_rate": 3.451666666666667e-06, "loss": 0.0014, "num_tokens": 1136742.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 3788 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 70.16666666666667, "frac_reward_zero_std": 0.0, "grad_norm": 6.2353901863098145, "kl": 0.027069970208685845, "learning_rate": 3.4511111111111113e-06, "loss": 0.2063, "num_tokens": 1137016.0, "reward": 2.25, "reward_std": 2.5, "rewards/reward_combined/mean": 2.25, "rewards/reward_combined/std": 2.5, "step": 3789 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 10.5, "completions/mean_terminated_length": 10.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 70.18518518518519, "frac_reward_zero_std": 1.0, "grad_norm": 0.13410010933876038, "kl": 0.02956196293234825, "learning_rate": 3.450555555555556e-06, "loss": 0.0015, "num_tokens": 1137278.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3790 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 70.20370370370371, "frac_reward_zero_std": 1.0, "grad_norm": 0.015089038759469986, "kl": 0.00039424299029633403, "learning_rate": 3.45e-06, "loss": 0.0, "num_tokens": 1137534.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 3791 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 70.22222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.010664639063179493, "kl": 0.01605843473225832, "learning_rate": 3.4494444444444448e-06, "loss": 0.0008, "num_tokens": 1137794.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3792 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 36.75, "completions/clipped_ratio": 0.0, "completions/max_length": 44.0, "completions/max_terminated_length": 44.0, "completions/mean_length": 36.75, "completions/mean_terminated_length": 36.75, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 70.24074074074075, "frac_reward_zero_std": 0.0, "grad_norm": 2.494248390197754, "kl": 0.2008618414402008, "learning_rate": 3.4488888888888896e-06, "loss": -0.099, "num_tokens": 1138169.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 3793 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 70.25925925925925, "frac_reward_zero_std": 1.0, "grad_norm": 0.05435778573155403, "kl": 0.02039443887770176, "learning_rate": 3.4483333333333335e-06, "loss": 0.0011, "num_tokens": 1138459.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3794 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 45.5, "completions/clipped_ratio": 0.0, "completions/max_length": 83.0, "completions/max_terminated_length": 83.0, "completions/mean_length": 45.5, "completions/mean_terminated_length": 45.5, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 70.27777777777777, "frac_reward_zero_std": 0.0, "grad_norm": 2.4853663444519043, "kl": 0.054243847727775574, "learning_rate": 3.4477777777777783e-06, "loss": 0.1784, "num_tokens": 1138865.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 3795 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 70.29629629629629, "frac_reward_zero_std": 1.0, "grad_norm": 0.013017385266721249, "kl": 0.0004042436776217073, "learning_rate": 3.447222222222222e-06, "loss": 0.0, "num_tokens": 1139137.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3796 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.0, "completions/clipped_ratio": 0.0, "completions/max_length": 35.0, "completions/max_terminated_length": 35.0, "completions/mean_length": 23.0, "completions/mean_terminated_length": 23.0, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 70.31481481481481, "frac_reward_zero_std": 0.0, "grad_norm": 3.557891845703125, "kl": 0.04111720249056816, "learning_rate": 3.446666666666667e-06, "loss": 0.1506, "num_tokens": 1139481.0, "reward": 5.875, "reward_std": 3.25, "rewards/reward_combined/mean": 5.875, "rewards/reward_combined/std": 3.25, "step": 3797 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.25, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.25, "completions/mean_terminated_length": 15.25, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 70.33333333333333, "frac_reward_zero_std": 1.0, "grad_norm": 0.018838588148355484, "kl": 0.0014369882992468774, "learning_rate": 3.4461111111111113e-06, "loss": 0.0001, "num_tokens": 1139802.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3798 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 70.35185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.009268294088542461, "kl": 0.003915768931619823, "learning_rate": 3.4455555555555557e-06, "loss": 0.0002, "num_tokens": 1140070.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3799 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.5, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 19.5, "completions/mean_terminated_length": 19.5, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 70.37037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.10681109875440598, "kl": 0.017109781969338655, "learning_rate": 3.445e-06, "loss": 0.0008, "num_tokens": 1140400.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3800 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 70.38888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.03087480738759041, "kl": 0.008256811648607254, "learning_rate": 3.444444444444445e-06, "loss": 0.0004, "num_tokens": 1140712.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 3801 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 34.5, "completions/clipped_ratio": 0.0, "completions/max_length": 63.0, "completions/max_terminated_length": 63.0, "completions/mean_length": 34.5, "completions/mean_terminated_length": 34.5, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 70.4074074074074, "frac_reward_zero_std": 0.0, "grad_norm": 2.35752010345459, "kl": 0.3559943586587906, "learning_rate": 3.443888888888889e-06, "loss": -0.0262, "num_tokens": 1141086.0, "reward": 2.25, "reward_std": 3.5, "rewards/reward_combined/mean": 2.25, "rewards/reward_combined/std": 3.5, "step": 3802 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 70.42592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.013541961088776588, "kl": 0.03382910043001175, "learning_rate": 3.4433333333333335e-06, "loss": 0.0017, "num_tokens": 1141302.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 3803 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.75, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 19.75, "completions/mean_terminated_length": 19.75, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 70.44444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.09820692986249924, "kl": 0.024656977504491806, "learning_rate": 3.4427777777777783e-06, "loss": 0.0012, "num_tokens": 1141601.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3804 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 70.46296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.05644148215651512, "kl": 0.0049325525760650635, "learning_rate": 3.4422222222222223e-06, "loss": 0.0002, "num_tokens": 1141813.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 3805 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 70.48148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.00011271451512584463, "kl": 9.894371032714844e-06, "learning_rate": 3.441666666666667e-06, "loss": 0.0, "num_tokens": 1142033.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 3806 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 70.5, "frac_reward_zero_std": 1.0, "grad_norm": 0.00793568417429924, "kl": 0.001602793054189533, "learning_rate": 3.4411111111111114e-06, "loss": 0.0001, "num_tokens": 1142297.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3807 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 70.51851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.02303546480834484, "kl": 0.009728140663355589, "learning_rate": 3.4405555555555557e-06, "loss": 0.0005, "num_tokens": 1142575.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3808 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.5, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 17.5, "completions/mean_terminated_length": 17.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 70.53703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.11369010806083679, "kl": 0.17427853494882584, "learning_rate": 3.44e-06, "loss": 0.0087, "num_tokens": 1142885.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3809 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.25, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.25, "completions/mean_terminated_length": 14.25, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 70.55555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.05241650715470314, "kl": 0.03259041905403137, "learning_rate": 3.439444444444445e-06, "loss": 0.0016, "num_tokens": 1143174.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3810 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 70.57407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.04065488278865814, "kl": 0.0009740963578224182, "learning_rate": 3.4388888888888892e-06, "loss": 0.0, "num_tokens": 1143386.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 3811 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 70.5925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.041840922087430954, "kl": 0.028884374536573887, "learning_rate": 3.4383333333333336e-06, "loss": 0.0015, "num_tokens": 1143656.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3812 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.0, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 24.0, "completions/mean_terminated_length": 24.0, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 70.61111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.0014570853672921658, "kl": 0.09055894613265991, "learning_rate": 3.4377777777777784e-06, "loss": 0.0045, "num_tokens": 1144020.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 3813 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.0, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 19.0, "completions/mean_terminated_length": 19.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 70.62962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.04771750420331955, "kl": 0.028682168573141098, "learning_rate": 3.4372222222222223e-06, "loss": 0.0014, "num_tokens": 1144344.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3814 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.5, "completions/clipped_ratio": 0.0, "completions/max_length": 33.0, "completions/max_terminated_length": 33.0, "completions/mean_length": 21.5, "completions/mean_terminated_length": 21.5, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 70.64814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.10082825273275375, "kl": 0.017951634246855974, "learning_rate": 3.436666666666667e-06, "loss": 0.0012, "num_tokens": 1144670.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3815 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 70.66666666666667, "frac_reward_zero_std": 1.0, "grad_norm": 0.01559057179838419, "kl": 0.0015416272217407823, "learning_rate": 3.436111111111111e-06, "loss": 0.0001, "num_tokens": 1144926.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3816 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 70.68518518518519, "frac_reward_zero_std": 1.0, "grad_norm": 0.007489846087992191, "kl": 0.003273865324445069, "learning_rate": 3.435555555555556e-06, "loss": 0.0002, "num_tokens": 1145212.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3817 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.5, "completions/clipped_ratio": 0.0, "completions/max_length": 32.0, "completions/max_terminated_length": 32.0, "completions/mean_length": 23.5, "completions/mean_terminated_length": 23.5, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 70.70370370370371, "frac_reward_zero_std": 0.0, "grad_norm": 2.6663784980773926, "kl": 0.1759313941001892, "learning_rate": 3.4350000000000006e-06, "loss": -0.1698, "num_tokens": 1145526.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 3818 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.0, "completions/clipped_ratio": 0.0, "completions/max_length": 31.0, "completions/max_terminated_length": 31.0, "completions/mean_length": 25.0, "completions/mean_terminated_length": 25.0, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 70.72222222222223, "frac_reward_zero_std": 0.0, "grad_norm": 3.1328656673431396, "kl": 0.05129479244351387, "learning_rate": 3.4344444444444445e-06, "loss": 0.0063, "num_tokens": 1145854.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 3819 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 26.75, "completions/clipped_ratio": 0.0, "completions/max_length": 30.0, "completions/max_terminated_length": 30.0, "completions/mean_length": 26.75, "completions/mean_terminated_length": 26.75, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 70.74074074074075, "frac_reward_zero_std": 1.0, "grad_norm": 0.29829564690589905, "kl": 0.20272741466760635, "learning_rate": 3.4338888888888893e-06, "loss": 0.0096, "num_tokens": 1146177.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3820 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.75, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 20.75, "completions/mean_terminated_length": 20.75, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 70.75925925925925, "frac_reward_zero_std": 1.0, "grad_norm": 0.03825705870985985, "kl": 0.01148970052599907, "learning_rate": 3.4333333333333336e-06, "loss": 0.0006, "num_tokens": 1146504.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3821 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.75, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.75, "completions/mean_terminated_length": 6.75, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 70.77777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.041678570210933685, "kl": 0.0038032694719731808, "learning_rate": 3.4327777777777784e-06, "loss": 0.0002, "num_tokens": 1146739.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 3822 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 70.79629629629629, "frac_reward_zero_std": 1.0, "grad_norm": 0.31088536977767944, "kl": 0.07595939561724663, "learning_rate": 3.4322222222222223e-06, "loss": 0.0037, "num_tokens": 1147013.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3823 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.75, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 4.75, "completions/mean_terminated_length": 4.75, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 70.81481481481481, "frac_reward_zero_std": 1.0, "grad_norm": 0.01090244296938181, "kl": 0.0008609190408606082, "learning_rate": 3.431666666666667e-06, "loss": 0.0, "num_tokens": 1147232.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 3824 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.012195121496915817, "clip_ratio/low_min": 0.012195121496915817, "clip_ratio/region_mean": 0.012195121496915817, "completion_length": 24.5, "completions/clipped_ratio": 0.0, "completions/max_length": 30.0, "completions/max_terminated_length": 30.0, "completions/mean_length": 24.5, "completions/mean_terminated_length": 24.5, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 70.83333333333333, "frac_reward_zero_std": 0.0, "grad_norm": 3.61566162109375, "kl": 0.10689140483736992, "learning_rate": 3.431111111111111e-06, "loss": 0.061, "num_tokens": 1147578.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 3825 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 70.85185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.000666158099193126, "kl": 0.0016053830040618777, "learning_rate": 3.430555555555556e-06, "loss": 0.0001, "num_tokens": 1147858.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3826 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.5, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 2.5, "completions/mean_terminated_length": 2.5, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 70.87037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.0499817319214344, "kl": 0.005164772272109985, "learning_rate": 3.4300000000000006e-06, "loss": 0.0002, "num_tokens": 1148064.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 3827 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.0, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 17.0, "completions/mean_terminated_length": 17.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 70.88888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.03417159244418144, "kl": 0.014005969278514385, "learning_rate": 3.4294444444444445e-06, "loss": 0.0008, "num_tokens": 1148352.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3828 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 70.9074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.004631369840353727, "kl": 0.010416842997074127, "learning_rate": 3.4288888888888893e-06, "loss": 0.0005, "num_tokens": 1148588.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 3829 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 70.92592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.12991218268871307, "kl": 0.01216417842078954, "learning_rate": 3.4283333333333337e-06, "loss": 0.0006, "num_tokens": 1148888.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3830 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 70.94444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.0035781587939709425, "kl": 0.000261109322309494, "learning_rate": 3.427777777777778e-06, "loss": 0.0, "num_tokens": 1149132.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 3831 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.75, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.75, "completions/mean_terminated_length": 14.75, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 70.96296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.08046979457139969, "kl": 0.008899566979380324, "learning_rate": 3.4272222222222224e-06, "loss": 0.0005, "num_tokens": 1149423.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3832 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 48.5, "completions/clipped_ratio": 0.0, "completions/max_length": 55.0, "completions/max_terminated_length": 55.0, "completions/mean_length": 48.5, "completions/mean_terminated_length": 48.5, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 70.98148148148148, "frac_reward_zero_std": 0.0, "grad_norm": 2.020296812057495, "kl": 0.08394738286733627, "learning_rate": 3.426666666666667e-06, "loss": -0.0945, "num_tokens": 1149837.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 3833 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.5, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 18.5, "completions/mean_terminated_length": 18.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 71.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.06824535876512527, "kl": 0.01452819723635912, "learning_rate": 3.426111111111111e-06, "loss": 0.0008, "num_tokens": 1150139.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3834 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 71.01851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.00011136698594782501, "kl": 9.879469871520996e-06, "learning_rate": 3.425555555555556e-06, "loss": 0.0, "num_tokens": 1150359.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 3835 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 71.03703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.059903331100940704, "kl": 0.0063877300708554685, "learning_rate": 3.4250000000000007e-06, "loss": 0.0003, "num_tokens": 1150647.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3836 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 17.0, "completions/mean_terminated_length": 17.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 71.05555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.04006200656294823, "kl": 0.0064985607750713825, "learning_rate": 3.4244444444444446e-06, "loss": 0.0003, "num_tokens": 1150955.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3837 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 73.75, "completions/clipped_ratio": 0.25, "completions/max_length": 256.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 73.75, "completions/mean_terminated_length": 13.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 71.07407407407408, "frac_reward_zero_std": 0.0, "grad_norm": 1.9168401956558228, "kl": 0.02440529502928257, "learning_rate": 3.4238888888888894e-06, "loss": 0.456, "num_tokens": 1151470.0, "reward": 7.0, "reward_std": 1.0, "rewards/reward_combined/mean": 7.0, "rewards/reward_combined/std": 1.0, "step": 3838 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.0, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 20.0, "completions/mean_terminated_length": 20.0, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 71.0925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.03477069362998009, "kl": 0.008861796464771032, "learning_rate": 3.4233333333333333e-06, "loss": 0.0004, "num_tokens": 1151778.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3839 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 71.11111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.008278573863208294, "kl": 0.0020523747662082314, "learning_rate": 3.422777777777778e-06, "loss": 0.0001, "num_tokens": 1152074.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3840 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 71.12962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.007014566566795111, "kl": 0.0011814236640930176, "learning_rate": 3.4222222222222224e-06, "loss": 0.0001, "num_tokens": 1152286.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 3841 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 71.14814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.015244378708302975, "kl": 0.00031526386737823486, "learning_rate": 3.4216666666666672e-06, "loss": 0.0, "num_tokens": 1152498.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 3842 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 27.25, "completions/clipped_ratio": 0.0, "completions/max_length": 37.0, "completions/max_terminated_length": 37.0, "completions/mean_length": 27.25, "completions/mean_terminated_length": 27.25, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 71.16666666666667, "frac_reward_zero_std": 1.0, "grad_norm": 0.04649772495031357, "kl": 0.08637004345655441, "learning_rate": 3.421111111111111e-06, "loss": 0.0043, "num_tokens": 1152875.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 3843 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 71.18518518518519, "frac_reward_zero_std": 1.0, "grad_norm": 0.015183196403086185, "kl": 0.009407495148479939, "learning_rate": 3.420555555555556e-06, "loss": 0.0005, "num_tokens": 1153187.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 3844 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 71.20370370370371, "frac_reward_zero_std": 1.0, "grad_norm": 0.01027660071849823, "kl": 0.0007435402367264032, "learning_rate": 3.4200000000000007e-06, "loss": 0.0, "num_tokens": 1153467.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3845 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 34.25, "completions/clipped_ratio": 0.0, "completions/max_length": 41.0, "completions/max_terminated_length": 41.0, "completions/mean_length": 34.25, "completions/mean_terminated_length": 34.25, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 71.22222222222223, "frac_reward_zero_std": 0.0, "grad_norm": 2.8411624431610107, "kl": 0.15121914818882942, "learning_rate": 3.4194444444444446e-06, "loss": 0.0273, "num_tokens": 1153832.0, "reward": 6.75, "reward_std": 2.1794495582580566, "rewards/reward_combined/mean": 6.75, "rewards/reward_combined/std": 2.1794495582580566, "step": 3846 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.75, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 4.75, "completions/mean_terminated_length": 4.75, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 71.24074074074075, "frac_reward_zero_std": 1.0, "grad_norm": 0.2369462549686432, "kl": 0.01964076646254398, "learning_rate": 3.4188888888888894e-06, "loss": 0.0009, "num_tokens": 1154051.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 3847 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 42.0, "completions/clipped_ratio": 0.0, "completions/max_length": 45.0, "completions/max_terminated_length": 45.0, "completions/mean_length": 42.0, "completions/mean_terminated_length": 42.0, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 71.25925925925925, "frac_reward_zero_std": 0.0, "grad_norm": 0.9835092425346375, "kl": 0.0998019389808178, "learning_rate": 3.4183333333333334e-06, "loss": -0.066, "num_tokens": 1154499.0, "reward": 2.375, "reward_std": 1.25, "rewards/reward_combined/mean": 2.375, "rewards/reward_combined/std": 1.25, "step": 3848 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.0, "completions/clipped_ratio": 0.0, "completions/max_length": 48.0, "completions/max_terminated_length": 48.0, "completions/mean_length": 24.0, "completions/mean_terminated_length": 24.0, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 71.27777777777777, "frac_reward_zero_std": 0.0, "grad_norm": 3.6190552711486816, "kl": 0.016234721755608916, "learning_rate": 3.417777777777778e-06, "loss": 0.2323, "num_tokens": 1154855.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 3849 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 10.5, "completions/mean_terminated_length": 10.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 71.29629629629629, "frac_reward_zero_std": 1.0, "grad_norm": 0.017408663406968117, "kl": 0.001227468776050955, "learning_rate": 3.4172222222222225e-06, "loss": 0.0001, "num_tokens": 1155117.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3850 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.0, "completions/clipped_ratio": 0.0, "completions/max_length": 27.0, "completions/max_terminated_length": 27.0, "completions/mean_length": 22.0, "completions/mean_terminated_length": 22.0, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 71.31481481481481, "frac_reward_zero_std": 0.0, "grad_norm": 3.3501477241516113, "kl": 0.04416090250015259, "learning_rate": 3.416666666666667e-06, "loss": 0.045, "num_tokens": 1155425.0, "reward": 7.625, "reward_std": 0.25, "rewards/reward_combined/mean": 7.625, "rewards/reward_combined/std": 0.25, "step": 3851 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.5, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 20.5, "completions/mean_terminated_length": 20.5, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 71.33333333333333, "frac_reward_zero_std": 1.0, "grad_norm": 0.05895019695162773, "kl": 0.018695805221796036, "learning_rate": 3.416111111111111e-06, "loss": 0.0009, "num_tokens": 1155727.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3852 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 71.35185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.005286484491080046, "kl": 0.002769030164927244, "learning_rate": 3.415555555555556e-06, "loss": 0.0001, "num_tokens": 1156011.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3853 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 71.37037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.015191629528999329, "kl": 0.014944656286388636, "learning_rate": 3.4150000000000003e-06, "loss": 0.0007, "num_tokens": 1156271.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3854 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 71.38888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.03128933534026146, "kl": 0.0013109196443110704, "learning_rate": 3.4144444444444447e-06, "loss": 0.0001, "num_tokens": 1156541.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3855 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.25, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 15.25, "completions/mean_terminated_length": 15.25, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 71.4074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.13558846712112427, "kl": 0.012867653043940663, "learning_rate": 3.4138888888888895e-06, "loss": 0.0007, "num_tokens": 1156814.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3856 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.5, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 21.5, "completions/mean_terminated_length": 21.5, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 71.42592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.026167450472712517, "kl": 0.006947818212211132, "learning_rate": 3.4133333333333334e-06, "loss": 0.0003, "num_tokens": 1157152.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3857 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.75, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 11.75, "completions/mean_terminated_length": 11.75, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 71.44444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.01097855344414711, "kl": 0.0020039306837134063, "learning_rate": 3.412777777777778e-06, "loss": 0.0001, "num_tokens": 1157423.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3858 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.25, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 20.25, "completions/mean_terminated_length": 20.25, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 71.46296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.034478191286325455, "kl": 0.008569823112338781, "learning_rate": 3.412222222222222e-06, "loss": 0.0004, "num_tokens": 1157748.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3859 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 71.48148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.10378371179103851, "kl": 0.01308836042881012, "learning_rate": 3.411666666666667e-06, "loss": 0.0007, "num_tokens": 1157992.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 3860 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 100.5, "completions/clipped_ratio": 0.25, "completions/max_length": 256.0, "completions/max_terminated_length": 50.0, "completions/mean_length": 100.5, "completions/mean_terminated_length": 48.66666793823242, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "epoch": 71.5, "frac_reward_zero_std": 0.0, "grad_norm": 1.3667340278625488, "kl": 0.09268810972571373, "learning_rate": 3.4111111111111113e-06, "loss": 0.372, "num_tokens": 1158614.0, "reward": 3.75, "reward_std": 4.9749369621276855, "rewards/reward_combined/mean": 3.75, "rewards/reward_combined/std": 4.9749369621276855, "step": 3861 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.75, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.75, "completions/mean_terminated_length": 14.75, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 71.51851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.04395792633295059, "kl": 0.029500206001102924, "learning_rate": 3.410555555555556e-06, "loss": 0.0015, "num_tokens": 1158905.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3862 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 71.53703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.0859733372926712, "kl": 0.24849341809749603, "learning_rate": 3.4100000000000004e-06, "loss": 0.0123, "num_tokens": 1159205.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3863 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.0, "completions/clipped_ratio": 0.0, "completions/max_length": 29.0, "completions/max_terminated_length": 29.0, "completions/mean_length": 22.0, "completions/mean_terminated_length": 22.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 71.55555555555556, "frac_reward_zero_std": 0.0, "grad_norm": 2.3345863819122314, "kl": 0.10893995314836502, "learning_rate": 3.4094444444444447e-06, "loss": -0.1269, "num_tokens": 1159517.0, "reward": 7.625, "reward_std": 0.25, "rewards/reward_combined/mean": 7.625, "rewards/reward_combined/std": 0.25, "step": 3864 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 18.0, "completions/mean_terminated_length": 18.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 71.57407407407408, "frac_reward_zero_std": 0.0, "grad_norm": 5.504992961883545, "kl": 0.1751508042216301, "learning_rate": 3.4088888888888895e-06, "loss": -0.0071, "num_tokens": 1159829.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 3865 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.5, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 9.5, "completions/mean_terminated_length": 9.5, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 71.5925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.020197853446006775, "kl": 0.0031101732747629285, "learning_rate": 3.4083333333333335e-06, "loss": 0.0002, "num_tokens": 1160091.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3866 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.5, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 18.5, "completions/mean_terminated_length": 18.5, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 71.61111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.024449441581964493, "kl": 0.002855303493561223, "learning_rate": 3.4077777777777782e-06, "loss": 0.0001, "num_tokens": 1160405.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3867 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.25, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 14.25, "completions/mean_terminated_length": 14.25, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 71.62962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.12562575936317444, "kl": 0.051051460206508636, "learning_rate": 3.407222222222222e-06, "loss": 0.0028, "num_tokens": 1160678.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3868 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 71.64814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.05844710022211075, "kl": 0.0203253710642457, "learning_rate": 3.406666666666667e-06, "loss": 0.001, "num_tokens": 1160946.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3869 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 29.5, "completions/clipped_ratio": 0.0, "completions/max_length": 31.0, "completions/max_terminated_length": 31.0, "completions/mean_length": 29.5, "completions/mean_terminated_length": 29.5, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 71.66666666666667, "frac_reward_zero_std": 1.0, "grad_norm": 0.10286296904087067, "kl": 0.050290366634726524, "learning_rate": 3.4061111111111113e-06, "loss": 0.0025, "num_tokens": 1161288.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3870 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.5, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.5, "completions/mean_terminated_length": 6.5, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 71.68518518518519, "frac_reward_zero_std": 1.0, "grad_norm": 0.07436588406562805, "kl": 0.00603824044810608, "learning_rate": 3.4055555555555557e-06, "loss": 0.0003, "num_tokens": 1161522.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 3871 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 71.70370370370371, "frac_reward_zero_std": 1.0, "grad_norm": 0.03788808733224869, "kl": 0.011120212264358997, "learning_rate": 3.4050000000000004e-06, "loss": 0.0006, "num_tokens": 1161808.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3872 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.0, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 19.0, "completions/mean_terminated_length": 19.0, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 71.72222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.04122086986899376, "kl": 0.02494671754539013, "learning_rate": 3.404444444444445e-06, "loss": 0.0012, "num_tokens": 1162136.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3873 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 26.75, "completions/clipped_ratio": 0.0, "completions/max_length": 32.0, "completions/max_terminated_length": 32.0, "completions/mean_length": 26.75, "completions/mean_terminated_length": 26.75, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 71.74074074074075, "frac_reward_zero_std": 1.0, "grad_norm": 0.048663269728422165, "kl": 0.15856093913316727, "learning_rate": 3.403888888888889e-06, "loss": 0.0078, "num_tokens": 1162463.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 3874 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 71.75925925925925, "frac_reward_zero_std": 1.0, "grad_norm": 0.03024672344326973, "kl": 0.029596254229545593, "learning_rate": 3.4033333333333335e-06, "loss": 0.0015, "num_tokens": 1162751.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3875 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 35.5, "completions/clipped_ratio": 0.0, "completions/max_length": 70.0, "completions/max_terminated_length": 70.0, "completions/mean_length": 35.5, "completions/mean_terminated_length": 35.5, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 71.77777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.3106412887573242, "kl": 0.12441641092300415, "learning_rate": 3.4027777777777783e-06, "loss": 0.0066, "num_tokens": 1163129.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3876 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.75, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 19.75, "completions/mean_terminated_length": 19.75, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 71.79629629629629, "frac_reward_zero_std": 1.0, "grad_norm": 0.0926685556769371, "kl": 0.10348881036043167, "learning_rate": 3.4022222222222222e-06, "loss": 0.0052, "num_tokens": 1163436.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3877 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 28.25, "completions/clipped_ratio": 0.0, "completions/max_length": 32.0, "completions/max_terminated_length": 32.0, "completions/mean_length": 28.25, "completions/mean_terminated_length": 28.25, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 71.81481481481481, "frac_reward_zero_std": 0.0, "grad_norm": 4.590888977050781, "kl": 0.18631959706544876, "learning_rate": 3.401666666666667e-06, "loss": -0.0418, "num_tokens": 1163765.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 3878 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 7.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 7.0, "completions/mean_terminated_length": 7.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 71.83333333333333, "frac_reward_zero_std": 1.0, "grad_norm": 0.10011114180088043, "kl": 0.0109938383102417, "learning_rate": 3.4011111111111113e-06, "loss": 0.0003, "num_tokens": 1164013.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 3879 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 71.85185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.4335857331752777, "kl": 0.07118495553731918, "learning_rate": 3.4005555555555557e-06, "loss": 0.0036, "num_tokens": 1164229.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 3880 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.75, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 8.75, "completions/mean_terminated_length": 8.75, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 71.87037037037037, "frac_reward_zero_std": 0.0, "grad_norm": 5.270288944244385, "kl": 0.007413439452648163, "learning_rate": 3.4000000000000005e-06, "loss": 0.084, "num_tokens": 1164492.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 3881 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.75, "completions/clipped_ratio": 0.0, "completions/max_length": 29.0, "completions/max_terminated_length": 29.0, "completions/mean_length": 25.75, "completions/mean_terminated_length": 25.75, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 71.88888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.07305454462766647, "kl": 0.04010534659028053, "learning_rate": 3.399444444444445e-06, "loss": 0.0021, "num_tokens": 1164843.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 3882 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 71.9074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.005191559437662363, "kl": 0.010285407304763794, "learning_rate": 3.398888888888889e-06, "loss": 0.0005, "num_tokens": 1165079.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 3883 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.25, "completions/clipped_ratio": 0.0, "completions/max_length": 64.0, "completions/max_terminated_length": 64.0, "completions/mean_length": 32.25, "completions/mean_terminated_length": 32.25, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 71.92592592592592, "frac_reward_zero_std": 0.0, "grad_norm": 4.098208427429199, "kl": 0.06854692474007607, "learning_rate": 3.3983333333333335e-06, "loss": 0.251, "num_tokens": 1165436.0, "reward": 4.5, "reward_std": 4.358899116516113, "rewards/reward_combined/mean": 4.5, "rewards/reward_combined/std": 4.358899116516113, "step": 3884 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 71.94444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.052033886313438416, "kl": 0.012186399661004543, "learning_rate": 3.3977777777777783e-06, "loss": 0.0006, "num_tokens": 1165708.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3885 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 71.96296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.027632495388388634, "kl": 0.0032838359475135803, "learning_rate": 3.3972222222222223e-06, "loss": 0.0002, "num_tokens": 1165916.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 3886 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 71.98148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.0748196542263031, "kl": 0.005394432329921983, "learning_rate": 3.396666666666667e-06, "loss": 0.0003, "num_tokens": 1166172.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 3887 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.25, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 20.25, "completions/mean_terminated_length": 20.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 72.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.04291499778628349, "kl": 0.023179568350315094, "learning_rate": 3.396111111111111e-06, "loss": 0.0011, "num_tokens": 1166501.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3888 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 72.01851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.025201044976711273, "kl": 0.0019228158053010702, "learning_rate": 3.3955555555555558e-06, "loss": 0.0001, "num_tokens": 1166823.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3889 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.5, "completions/clipped_ratio": 0.0, "completions/max_length": 31.0, "completions/max_terminated_length": 31.0, "completions/mean_length": 24.5, "completions/mean_terminated_length": 24.5, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 72.03703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.12031056731939316, "kl": 0.034986975602805614, "learning_rate": 3.3950000000000005e-06, "loss": 0.002, "num_tokens": 1167169.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 3890 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 72.05555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.009541605599224567, "kl": 0.0015886223409324884, "learning_rate": 3.3944444444444445e-06, "loss": 0.0001, "num_tokens": 1167465.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3891 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 72.07407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.06009736284613609, "kl": 0.013208307325839996, "learning_rate": 3.3938888888888892e-06, "loss": 0.0007, "num_tokens": 1167725.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 3892 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.5, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 8.5, "completions/mean_terminated_length": 8.5, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 72.0925925925926, "frac_reward_zero_std": 0.0, "grad_norm": 3.3262009620666504, "kl": 0.026400122791528702, "learning_rate": 3.3933333333333336e-06, "loss": 0.212, "num_tokens": 1167979.0, "reward": 2.75, "reward_std": 1.5, "rewards/reward_combined/mean": 2.75, "rewards/reward_combined/std": 1.5, "step": 3893 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.25, "completions/clipped_ratio": 0.0, "completions/max_length": 27.0, "completions/max_terminated_length": 27.0, "completions/mean_length": 22.25, "completions/mean_terminated_length": 22.25, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 72.11111111111111, "frac_reward_zero_std": 0.0, "grad_norm": 3.7167489528656006, "kl": 0.014073840342462063, "learning_rate": 3.3927777777777784e-06, "loss": 0.0569, "num_tokens": 1168320.0, "reward": 6.75, "reward_std": 2.5, "rewards/reward_combined/mean": 6.75, "rewards/reward_combined/std": 2.5, "step": 3894 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 72.12962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.004899331368505955, "kl": 0.010339170694351196, "learning_rate": 3.3922222222222223e-06, "loss": 0.0005, "num_tokens": 1168556.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 3895 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.0, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 24.0, "completions/mean_terminated_length": 24.0, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 72.14814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.0014719521859660745, "kl": 0.09054877236485481, "learning_rate": 3.391666666666667e-06, "loss": 0.0045, "num_tokens": 1168920.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 3896 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.75, "completions/clipped_ratio": 0.0, "completions/max_length": 31.0, "completions/max_terminated_length": 31.0, "completions/mean_length": 20.75, "completions/mean_terminated_length": 20.75, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 72.16666666666667, "frac_reward_zero_std": 1.0, "grad_norm": 0.06249469146132469, "kl": 0.15268462151288986, "learning_rate": 3.391111111111111e-06, "loss": 0.0074, "num_tokens": 1169243.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3897 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 72.18518518518519, "frac_reward_zero_std": 1.0, "grad_norm": 0.019830452278256416, "kl": 0.0027261875802651048, "learning_rate": 3.390555555555556e-06, "loss": 0.0001, "num_tokens": 1169533.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3898 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 72.20370370370371, "frac_reward_zero_std": 1.0, "grad_norm": 0.028914324939250946, "kl": 0.00789010594598949, "learning_rate": 3.3900000000000006e-06, "loss": 0.0004, "num_tokens": 1169837.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3899 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.75, "completions/clipped_ratio": 0.0, "completions/max_length": 27.0, "completions/max_terminated_length": 27.0, "completions/mean_length": 21.75, "completions/mean_terminated_length": 21.75, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 72.22222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.07218263298273087, "kl": 0.018855460919439793, "learning_rate": 3.3894444444444445e-06, "loss": 0.001, "num_tokens": 1170144.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3900 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 34.5, "completions/clipped_ratio": 0.0, "completions/max_length": 38.0, "completions/max_terminated_length": 38.0, "completions/mean_length": 34.5, "completions/mean_terminated_length": 34.5, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 72.24074074074075, "frac_reward_zero_std": 1.0, "grad_norm": 0.03641039505600929, "kl": 0.06164906919002533, "learning_rate": 3.3888888888888893e-06, "loss": 0.0031, "num_tokens": 1170506.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3901 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.25, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.25, "completions/mean_terminated_length": 14.25, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 72.25925925925925, "frac_reward_zero_std": 1.0, "grad_norm": 0.029392365366220474, "kl": 0.03291566204279661, "learning_rate": 3.3883333333333336e-06, "loss": 0.0017, "num_tokens": 1170795.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3902 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 72.27777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.0735182985663414, "kl": 0.019092006608843803, "learning_rate": 3.387777777777778e-06, "loss": 0.001, "num_tokens": 1171081.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3903 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 72.29629629629629, "frac_reward_zero_std": 1.0, "grad_norm": 0.003956756088882685, "kl": 0.0002965852618217468, "learning_rate": 3.3872222222222224e-06, "loss": 0.0, "num_tokens": 1171325.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 3904 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 48.0, "completions/clipped_ratio": 0.0, "completions/max_length": 48.0, "completions/max_terminated_length": 48.0, "completions/mean_length": 48.0, "completions/mean_terminated_length": 48.0, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "epoch": 72.31481481481481, "frac_reward_zero_std": 1.0, "grad_norm": 0.03988475725054741, "kl": 0.027026942931115627, "learning_rate": 3.386666666666667e-06, "loss": 0.0014, "num_tokens": 1171737.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3905 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.25, "completions/clipped_ratio": 0.0, "completions/max_length": 28.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 23.25, "completions/mean_terminated_length": 23.25, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 72.33333333333333, "frac_reward_zero_std": 1.0, "grad_norm": 0.11281057447195053, "kl": 0.05915150046348572, "learning_rate": 3.386111111111111e-06, "loss": 0.003, "num_tokens": 1172082.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3906 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 72.35185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.056197844445705414, "kl": 0.03479776531457901, "learning_rate": 3.385555555555556e-06, "loss": 0.0017, "num_tokens": 1172298.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 3907 }, { "clip_ratio/high_max": 0.007575757801532745, "clip_ratio/high_mean": 0.007575757801532745, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.007575757801532745, "completion_length": 34.0, "completions/clipped_ratio": 0.0, "completions/max_length": 36.0, "completions/max_terminated_length": 36.0, "completions/mean_length": 34.0, "completions/mean_terminated_length": 34.0, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 72.37037037037037, "frac_reward_zero_std": 0.0, "grad_norm": 1.5853874683380127, "kl": 0.11416057124733925, "learning_rate": 3.3850000000000006e-06, "loss": -0.0421, "num_tokens": 1172662.0, "reward": 7.875, "reward_std": 0.25, "rewards/reward_combined/mean": 7.875, "rewards/reward_combined/std": 0.25, "step": 3908 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 40.5, "completions/clipped_ratio": 0.0, "completions/max_length": 52.0, "completions/max_terminated_length": 52.0, "completions/mean_length": 40.5, "completions/mean_terminated_length": 40.5, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 72.38888888888889, "frac_reward_zero_std": 0.0, "grad_norm": 3.18611478805542, "kl": 0.22550596296787262, "learning_rate": 3.3844444444444446e-06, "loss": 0.0655, "num_tokens": 1173048.0, "reward": 2.25, "reward_std": 3.947572946548462, "rewards/reward_combined/mean": 2.25, "rewards/reward_combined/std": 3.947573184967041, "step": 3909 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.5, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 18.5, "completions/mean_terminated_length": 18.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 72.4074074074074, "frac_reward_zero_std": 0.0, "grad_norm": 4.28048849105835, "kl": 0.17118452489376068, "learning_rate": 3.3838888888888893e-06, "loss": -0.0687, "num_tokens": 1173350.0, "reward": 6.75, "reward_std": 2.5, "rewards/reward_combined/mean": 6.75, "rewards/reward_combined/std": 2.5, "step": 3910 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 12.5, "completions/mean_terminated_length": 12.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 72.42592592592592, "frac_reward_zero_std": 0.0, "grad_norm": 1.0636924505233765, "kl": 0.523501965450123, "learning_rate": 3.3833333333333333e-06, "loss": 0.03, "num_tokens": 1173624.0, "reward": 7.25, "reward_std": 1.5, "rewards/reward_combined/mean": 7.25, "rewards/reward_combined/std": 1.5, "step": 3911 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 42.25, "completions/clipped_ratio": 0.0, "completions/max_length": 43.0, "completions/max_terminated_length": 43.0, "completions/mean_length": 42.25, "completions/mean_terminated_length": 42.25, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "epoch": 72.44444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.05874933302402496, "kl": 0.05400611087679863, "learning_rate": 3.382777777777778e-06, "loss": 0.0027, "num_tokens": 1174073.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 3912 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.25, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 15.25, "completions/mean_terminated_length": 15.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 72.46296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.08533456176519394, "kl": 0.010557691566646099, "learning_rate": 3.3822222222222224e-06, "loss": 0.0007, "num_tokens": 1174358.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3913 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 7.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 7.0, "completions/mean_terminated_length": 7.0, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 72.48148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.00950641930103302, "kl": 0.0004475712776184082, "learning_rate": 3.381666666666667e-06, "loss": 0.0, "num_tokens": 1174594.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 3914 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.25, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.25, "completions/mean_terminated_length": 11.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 72.5, "frac_reward_zero_std": 1.0, "grad_norm": 0.023120131343603134, "kl": 0.010022151283919811, "learning_rate": 3.381111111111111e-06, "loss": 0.0005, "num_tokens": 1174867.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3915 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.25, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 20.25, "completions/mean_terminated_length": 20.25, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 72.51851851851852, "frac_reward_zero_std": 0.0, "grad_norm": 4.584756374359131, "kl": 0.1723315641283989, "learning_rate": 3.380555555555556e-06, "loss": -0.0252, "num_tokens": 1175168.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 3916 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 72.53703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.000132914719870314, "kl": 1.2964010238647461e-05, "learning_rate": 3.3800000000000007e-06, "loss": 0.0, "num_tokens": 1175388.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 3917 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 72.55555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.01056237705051899, "kl": 0.0002635195851325989, "learning_rate": 3.3794444444444446e-06, "loss": 0.0, "num_tokens": 1175600.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 3918 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 10.5, "completions/mean_terminated_length": 10.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 72.57407407407408, "frac_reward_zero_std": 0.0, "grad_norm": 3.958815813064575, "kl": 0.008103714790195227, "learning_rate": 3.3788888888888894e-06, "loss": -0.0037, "num_tokens": 1175866.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 3919 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 72.5925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.0059830727986991405, "kl": 0.0027914689853787422, "learning_rate": 3.3783333333333333e-06, "loss": 0.0001, "num_tokens": 1176150.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3920 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.75, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 20.75, "completions/mean_terminated_length": 20.75, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 72.61111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.03455163538455963, "kl": 0.010192228015512228, "learning_rate": 3.377777777777778e-06, "loss": 0.0005, "num_tokens": 1176477.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3921 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.0, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 20.0, "completions/mean_terminated_length": 20.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 72.62962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.0426318533718586, "kl": 0.02502452116459608, "learning_rate": 3.3772222222222225e-06, "loss": 0.0012, "num_tokens": 1176805.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3922 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 5.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 5.0, "completions/mean_terminated_length": 5.0, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 72.64814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.005168336909264326, "kl": 0.000852328521432355, "learning_rate": 3.376666666666667e-06, "loss": 0.0, "num_tokens": 1177025.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 3923 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.25, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 3.25, "completions/mean_terminated_length": 3.25, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 72.66666666666667, "frac_reward_zero_std": 1.0, "grad_norm": 0.06763815879821777, "kl": 0.0031872421968728304, "learning_rate": 3.376111111111111e-06, "loss": 0.0002, "num_tokens": 1177234.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 3924 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.75, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 12.75, "completions/mean_terminated_length": 12.75, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 72.68518518518519, "frac_reward_zero_std": 0.0, "grad_norm": 5.997629165649414, "kl": 0.3611306697130203, "learning_rate": 3.375555555555556e-06, "loss": -0.0174, "num_tokens": 1177501.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 3925 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 77.5, "completions/clipped_ratio": 0.25, "completions/max_length": 256.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 77.5, "completions/mean_terminated_length": 18.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 72.70370370370371, "frac_reward_zero_std": 0.0, "grad_norm": 2.4127800464630127, "kl": 0.04035552591085434, "learning_rate": 3.3750000000000003e-06, "loss": 0.5451, "num_tokens": 1178031.0, "reward": 3.549999952316284, "reward_std": 4.702127456665039, "rewards/reward_combined/mean": 3.549999952316284, "rewards/reward_combined/std": 4.702127456665039, "step": 3926 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.5, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 24.5, "completions/mean_terminated_length": 24.5, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 72.72222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.1575523167848587, "kl": 0.12786313146352768, "learning_rate": 3.3744444444444447e-06, "loss": 0.0064, "num_tokens": 1178365.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3927 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 72.74074074074075, "frac_reward_zero_std": 1.0, "grad_norm": 0.0503716841340065, "kl": 0.015614181756973267, "learning_rate": 3.3738888888888894e-06, "loss": 0.0008, "num_tokens": 1178633.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3928 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 72.75925925925925, "frac_reward_zero_std": 1.0, "grad_norm": 0.0015685084508731961, "kl": 0.00020018815848743543, "learning_rate": 3.3733333333333334e-06, "loss": 0.0, "num_tokens": 1178889.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 3929 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 72.77777777777777, "frac_reward_zero_std": 0.0, "grad_norm": 1.9848273992538452, "kl": 0.0177675262093544, "learning_rate": 3.372777777777778e-06, "loss": -0.0012, "num_tokens": 1179173.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 3930 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.0, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 17.0, "completions/mean_terminated_length": 17.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 72.79629629629629, "frac_reward_zero_std": 1.0, "grad_norm": 0.4081580638885498, "kl": 0.07870077714323997, "learning_rate": 3.372222222222222e-06, "loss": 0.0034, "num_tokens": 1179469.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3931 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 9.5, "completions/mean_terminated_length": 9.5, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 72.81481481481481, "frac_reward_zero_std": 1.0, "grad_norm": 0.04120433330535889, "kl": 0.004897338338196278, "learning_rate": 3.371666666666667e-06, "loss": 0.0003, "num_tokens": 1179727.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3932 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.75, "completions/clipped_ratio": 0.0, "completions/max_length": 25.0, "completions/max_terminated_length": 25.0, "completions/mean_length": 23.75, "completions/mean_terminated_length": 23.75, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 72.83333333333333, "frac_reward_zero_std": 0.0, "grad_norm": 2.351560592651367, "kl": 0.029560036957263947, "learning_rate": 3.371111111111111e-06, "loss": 0.0307, "num_tokens": 1180050.0, "reward": 4.0, "reward_std": 4.690415859222412, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 4.690415859222412, "step": 3933 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.75, "completions/clipped_ratio": 0.0, "completions/max_length": 28.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 19.75, "completions/mean_terminated_length": 19.75, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 72.85185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.031134774908423424, "kl": 0.006816594395786524, "learning_rate": 3.370555555555556e-06, "loss": 0.0004, "num_tokens": 1180369.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3934 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 72.87037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.11986535042524338, "kl": 0.007218859274871647, "learning_rate": 3.3700000000000003e-06, "loss": 0.0003, "num_tokens": 1180645.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3935 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 29.25, "completions/clipped_ratio": 0.0, "completions/max_length": 30.0, "completions/max_terminated_length": 30.0, "completions/mean_length": 29.25, "completions/mean_terminated_length": 29.25, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 72.88888888888889, "frac_reward_zero_std": 0.0, "grad_norm": 3.959419012069702, "kl": 0.15237130969762802, "learning_rate": 3.3694444444444447e-06, "loss": 0.0095, "num_tokens": 1180978.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 3936 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 72.9074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.01434636302292347, "kl": 0.0008839964866638184, "learning_rate": 3.3688888888888895e-06, "loss": 0.0, "num_tokens": 1181194.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 3937 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 72.92592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.016225570812821388, "kl": 0.00918120238929987, "learning_rate": 3.3683333333333334e-06, "loss": 0.0005, "num_tokens": 1181506.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 3938 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.5, "completions/clipped_ratio": 0.0, "completions/max_length": 66.0, "completions/max_terminated_length": 66.0, "completions/mean_length": 24.5, "completions/mean_terminated_length": 24.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 72.94444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.16228145360946655, "kl": 0.04778813011944294, "learning_rate": 3.367777777777778e-06, "loss": 0.0021, "num_tokens": 1181824.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3939 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 72.96296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.017454959452152252, "kl": 0.01462017185986042, "learning_rate": 3.367222222222222e-06, "loss": 0.0007, "num_tokens": 1182084.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3940 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 72.98148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.04757387191057205, "kl": 0.23166526854038239, "learning_rate": 3.366666666666667e-06, "loss": 0.0115, "num_tokens": 1182386.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3941 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.75, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 12.75, "completions/mean_terminated_length": 12.75, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 73.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.14170090854167938, "kl": 0.017828218638896942, "learning_rate": 3.3661111111111117e-06, "loss": 0.0008, "num_tokens": 1182649.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3942 }, { "clip_ratio/high_max": 0.0071428571827709675, "clip_ratio/high_mean": 0.0071428571827709675, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0071428571827709675, "completion_length": 35.5, "completions/clipped_ratio": 0.0, "completions/max_length": 38.0, "completions/max_terminated_length": 38.0, "completions/mean_length": 35.5, "completions/mean_terminated_length": 35.5, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 73.01851851851852, "frac_reward_zero_std": 0.0, "grad_norm": 3.3645377159118652, "kl": 0.15533238649368286, "learning_rate": 3.3655555555555556e-06, "loss": -0.003, "num_tokens": 1183015.0, "reward": 5.0, "reward_std": 3.674234628677368, "rewards/reward_combined/mean": 5.0, "rewards/reward_combined/std": 3.674234628677368, "step": 3943 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 73.03703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.040446873754262924, "kl": 0.0036596462596207857, "learning_rate": 3.3650000000000004e-06, "loss": 0.0002, "num_tokens": 1183285.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3944 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 73.05555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.007257416378706694, "kl": 0.00047691911458969116, "learning_rate": 3.3644444444444447e-06, "loss": 0.0, "num_tokens": 1183497.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 3945 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 73.07407407407408, "frac_reward_zero_std": 0.0, "grad_norm": 5.310818672180176, "kl": 0.05847947299480438, "learning_rate": 3.363888888888889e-06, "loss": 0.0011, "num_tokens": 1183797.0, "reward": 6.75, "reward_std": 2.5, "rewards/reward_combined/mean": 6.75, "rewards/reward_combined/std": 2.5, "step": 3946 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 73.0925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.05682429298758507, "kl": 0.015052524395287037, "learning_rate": 3.3633333333333335e-06, "loss": 0.0009, "num_tokens": 1184065.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3947 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 73.11111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.18376107513904572, "kl": 0.04691579192876816, "learning_rate": 3.3627777777777782e-06, "loss": 0.0023, "num_tokens": 1184351.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3948 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.75, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 20.75, "completions/mean_terminated_length": 20.75, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 73.12962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.02067289501428604, "kl": 0.01409248961135745, "learning_rate": 3.362222222222222e-06, "loss": 0.0007, "num_tokens": 1184682.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3949 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.75, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 13.75, "completions/mean_terminated_length": 13.75, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 73.14814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.09705328196287155, "kl": 0.011530196759849787, "learning_rate": 3.361666666666667e-06, "loss": 0.0006, "num_tokens": 1184949.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3950 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.0, "completions/clipped_ratio": 0.0, "completions/max_length": 29.0, "completions/max_terminated_length": 29.0, "completions/mean_length": 22.0, "completions/mean_terminated_length": 22.0, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 73.16666666666667, "frac_reward_zero_std": 1.0, "grad_norm": 0.07971096783876419, "kl": 0.09875429421663284, "learning_rate": 3.3611111111111117e-06, "loss": 0.0047, "num_tokens": 1185265.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3951 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 73.18518518518519, "frac_reward_zero_std": 1.0, "grad_norm": 0.09375443309545517, "kl": 0.029402881860733032, "learning_rate": 3.3605555555555557e-06, "loss": 0.0015, "num_tokens": 1185543.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3952 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 7.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 7.0, "completions/mean_terminated_length": 7.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 73.20370370370371, "frac_reward_zero_std": 1.0, "grad_norm": 0.1592673361301422, "kl": 0.03334164619445801, "learning_rate": 3.3600000000000004e-06, "loss": 0.001, "num_tokens": 1185791.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 3953 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.0, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 22.0, "completions/mean_terminated_length": 22.0, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 73.22222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.02704855240881443, "kl": 0.005737133789807558, "learning_rate": 3.359444444444445e-06, "loss": 0.0003, "num_tokens": 1186123.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3954 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 43.25, "completions/clipped_ratio": 0.0, "completions/max_length": 44.0, "completions/max_terminated_length": 44.0, "completions/mean_length": 43.25, "completions/mean_terminated_length": 43.25, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "epoch": 73.24074074074075, "frac_reward_zero_std": 1.0, "grad_norm": 0.04824978485703468, "kl": 0.039221037179231644, "learning_rate": 3.358888888888889e-06, "loss": 0.002, "num_tokens": 1186576.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 3955 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 73.25925925925925, "frac_reward_zero_std": 0.0, "grad_norm": 4.007014274597168, "kl": 0.5278907797764987, "learning_rate": 3.3583333333333335e-06, "loss": 0.0593, "num_tokens": 1186874.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 3956 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.0, "completions/clipped_ratio": 0.0, "completions/max_length": 31.0, "completions/max_terminated_length": 31.0, "completions/mean_length": 22.0, "completions/mean_terminated_length": 22.0, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 73.27777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.11603160947561264, "kl": 0.03765805810689926, "learning_rate": 3.3577777777777783e-06, "loss": 0.002, "num_tokens": 1187214.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3957 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 73.29629629629629, "frac_reward_zero_std": 0.0, "grad_norm": 4.771048069000244, "kl": 0.033547437749803066, "learning_rate": 3.3572222222222222e-06, "loss": 0.1047, "num_tokens": 1187492.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 3958 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 30.25, "completions/clipped_ratio": 0.0, "completions/max_length": 33.0, "completions/max_terminated_length": 33.0, "completions/mean_length": 30.25, "completions/mean_terminated_length": 30.25, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 73.31481481481481, "frac_reward_zero_std": 0.0, "grad_norm": 2.4846413135528564, "kl": 0.2544099912047386, "learning_rate": 3.356666666666667e-06, "loss": 0.0146, "num_tokens": 1187829.0, "reward": 5.75, "reward_std": 2.598076105117798, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 2.598076105117798, "step": 3959 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.25, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 19.25, "completions/mean_terminated_length": 19.25, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 73.33333333333333, "frac_reward_zero_std": 0.0, "grad_norm": 2.015000343322754, "kl": 0.01971594078349881, "learning_rate": 3.3561111111111118e-06, "loss": 0.0018, "num_tokens": 1188146.0, "reward": 4.5, "reward_std": 4.041451930999756, "rewards/reward_combined/mean": 4.5, "rewards/reward_combined/std": 4.041451930999756, "step": 3960 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 73.35185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.022218888625502586, "kl": 0.00136663019657135, "learning_rate": 3.3555555555555557e-06, "loss": 0.0001, "num_tokens": 1188402.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3961 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 73.37037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.013508559204638004, "kl": 0.015397094655781984, "learning_rate": 3.3550000000000005e-06, "loss": 0.0008, "num_tokens": 1188662.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3962 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.75, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.75, "completions/mean_terminated_length": 14.75, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 73.38888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.031541548669338226, "kl": 0.0091874694917351, "learning_rate": 3.3544444444444444e-06, "loss": 0.0004, "num_tokens": 1188953.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3963 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.75, "completions/clipped_ratio": 0.0, "completions/max_length": 34.0, "completions/max_terminated_length": 34.0, "completions/mean_length": 25.75, "completions/mean_terminated_length": 25.75, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 73.4074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.46769851446151733, "kl": 0.059758687391877174, "learning_rate": 3.353888888888889e-06, "loss": 0.0036, "num_tokens": 1189304.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 3964 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 73.42592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.007553958799690008, "kl": 0.001599833369255066, "learning_rate": 3.3533333333333336e-06, "loss": 0.0001, "num_tokens": 1189516.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 3965 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 73.44444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.007547975983470678, "kl": 0.0004929580900352448, "learning_rate": 3.3527777777777783e-06, "loss": 0.0, "num_tokens": 1189804.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3966 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.5, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 17.5, "completions/mean_terminated_length": 17.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 73.46296296296296, "frac_reward_zero_std": 0.0, "grad_norm": 2.31489634513855, "kl": 0.29789987206459045, "learning_rate": 3.3522222222222223e-06, "loss": 0.0389, "num_tokens": 1190114.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 3967 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 33.5, "completions/clipped_ratio": 0.0, "completions/max_length": 38.0, "completions/max_terminated_length": 38.0, "completions/mean_length": 33.5, "completions/mean_terminated_length": 33.5, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 73.48148148148148, "frac_reward_zero_std": 0.0, "grad_norm": 2.0151541233062744, "kl": 0.07216551713645458, "learning_rate": 3.351666666666667e-06, "loss": 0.0455, "num_tokens": 1190472.0, "reward": 5.25, "reward_std": 4.5, "rewards/reward_combined/mean": 5.25, "rewards/reward_combined/std": 4.5, "step": 3968 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.5, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 17.5, "completions/mean_terminated_length": 17.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 73.5, "frac_reward_zero_std": 1.0, "grad_norm": 0.07839857786893845, "kl": 0.014736623503267765, "learning_rate": 3.351111111111112e-06, "loss": 0.0007, "num_tokens": 1190770.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3969 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 73.51851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.01610853523015976, "kl": 0.21512508392333984, "learning_rate": 3.3505555555555558e-06, "loss": 0.0108, "num_tokens": 1191074.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3970 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.0, "completions/clipped_ratio": 0.0, "completions/max_length": 29.0, "completions/max_terminated_length": 29.0, "completions/mean_length": 22.0, "completions/mean_terminated_length": 22.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 73.53703703703704, "frac_reward_zero_std": 0.0, "grad_norm": 3.125868797302246, "kl": 0.044931743294000626, "learning_rate": 3.3500000000000005e-06, "loss": 0.0757, "num_tokens": 1191382.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 3971 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 73.55555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.13348746299743652, "kl": 0.03784079663455486, "learning_rate": 3.3494444444444445e-06, "loss": 0.0019, "num_tokens": 1191670.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3972 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.75, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 17.75, "completions/mean_terminated_length": 17.75, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 73.57407407407408, "frac_reward_zero_std": 0.0, "grad_norm": 3.013148307800293, "kl": 0.028943483717739582, "learning_rate": 3.3488888888888892e-06, "loss": -0.0102, "num_tokens": 1191961.0, "reward": 7.625, "reward_std": 0.25, "rewards/reward_combined/mean": 7.625, "rewards/reward_combined/std": 0.25, "step": 3973 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 73.5925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.6838208436965942, "kl": 0.058455463498830795, "learning_rate": 3.3483333333333336e-06, "loss": 0.0029, "num_tokens": 1192221.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 3974 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.0, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 22.0, "completions/mean_terminated_length": 22.0, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 73.61111111111111, "frac_reward_zero_std": 0.0, "grad_norm": 1.8894543647766113, "kl": 0.13217605464160442, "learning_rate": 3.347777777777778e-06, "loss": 0.0042, "num_tokens": 1192537.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 3975 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 73.62962962962963, "frac_reward_zero_std": 0.0, "grad_norm": 4.162251949310303, "kl": 0.011513160541653633, "learning_rate": 3.3472222222222223e-06, "loss": -0.0024, "num_tokens": 1192809.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 3976 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.0, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 21.0, "completions/mean_terminated_length": 21.0, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 73.64814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.11351882666349411, "kl": 0.03035164065659046, "learning_rate": 3.346666666666667e-06, "loss": 0.0015, "num_tokens": 1193145.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3977 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.25, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 12.25, "completions/mean_terminated_length": 12.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 73.66666666666667, "frac_reward_zero_std": 1.0, "grad_norm": 0.39486223459243774, "kl": 0.04056547116488218, "learning_rate": 3.3461111111111114e-06, "loss": 0.0021, "num_tokens": 1193410.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3978 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 48.0, "completions/clipped_ratio": 0.0, "completions/max_length": 48.0, "completions/max_terminated_length": 48.0, "completions/mean_length": 48.0, "completions/mean_terminated_length": 48.0, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "epoch": 73.68518518518519, "frac_reward_zero_std": 1.0, "grad_norm": 0.02097989246249199, "kl": 0.023174438625574112, "learning_rate": 3.345555555555556e-06, "loss": 0.0012, "num_tokens": 1193822.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3979 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 73.70370370370371, "frac_reward_zero_std": 1.0, "grad_norm": 0.0136579480022192, "kl": 0.0032924527768045664, "learning_rate": 3.3450000000000006e-06, "loss": 0.0002, "num_tokens": 1194106.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3980 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 73.72222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.022239889949560165, "kl": 0.009276566095650196, "learning_rate": 3.3444444444444445e-06, "loss": 0.0005, "num_tokens": 1194418.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 3981 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 73.74074074074075, "frac_reward_zero_std": 1.0, "grad_norm": 0.012366288341581821, "kl": 0.00033426881782361306, "learning_rate": 3.3438888888888893e-06, "loss": 0.0, "num_tokens": 1194674.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 3982 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.5, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.5, "completions/mean_terminated_length": 6.5, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 73.75925925925925, "frac_reward_zero_std": 1.0, "grad_norm": 0.04241267219185829, "kl": 0.003366221208125353, "learning_rate": 3.3433333333333332e-06, "loss": 0.0002, "num_tokens": 1194908.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 3983 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 73.77777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.08604507893323898, "kl": 0.006687596440315247, "learning_rate": 3.342777777777778e-06, "loss": 0.0003, "num_tokens": 1195152.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 3984 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 5.25, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 5.25, "completions/mean_terminated_length": 5.25, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 73.79629629629629, "frac_reward_zero_std": 1.0, "grad_norm": 0.41088253259658813, "kl": 0.05388759006746113, "learning_rate": 3.3422222222222224e-06, "loss": 0.0029, "num_tokens": 1195373.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 3985 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.0, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 24.0, "completions/mean_terminated_length": 24.0, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 73.81481481481481, "frac_reward_zero_std": 0.0, "grad_norm": 12.962249755859375, "kl": 2.9513736218214035, "learning_rate": 3.341666666666667e-06, "loss": 0.1491, "num_tokens": 1195737.0, "reward": 2.875, "reward_std": 1.25, "rewards/reward_combined/mean": 2.875, "rewards/reward_combined/std": 1.25, "step": 3986 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 73.83333333333333, "frac_reward_zero_std": 1.0, "grad_norm": 0.004974805749952793, "kl": 0.010439351201057434, "learning_rate": 3.3411111111111115e-06, "loss": 0.0005, "num_tokens": 1195973.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 3987 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.75, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.75, "completions/mean_terminated_length": 15.75, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 73.85185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.06673316657543182, "kl": 0.0041991183534264565, "learning_rate": 3.340555555555556e-06, "loss": 0.0002, "num_tokens": 1196296.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3988 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 73.87037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.0001686336036073044, "kl": 2.8401613235473633e-05, "learning_rate": 3.3400000000000006e-06, "loss": 0.0, "num_tokens": 1196516.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 3989 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.5, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 24.5, "completions/mean_terminated_length": 24.5, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 73.88888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.10056261718273163, "kl": 0.1014031171798706, "learning_rate": 3.3394444444444446e-06, "loss": 0.0051, "num_tokens": 1196850.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 3990 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 5.5, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 5.5, "completions/mean_terminated_length": 5.5, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 73.9074074074074, "frac_reward_zero_std": 0.0, "grad_norm": 11.211542129516602, "kl": 0.48674391955137253, "learning_rate": 3.3388888888888893e-06, "loss": 0.2475, "num_tokens": 1197072.0, "reward": 3.875, "reward_std": 0.25, "rewards/reward_combined/mean": 3.875, "rewards/reward_combined/std": 0.25, "step": 3991 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.5, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 2.5, "completions/mean_terminated_length": 2.5, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 73.92592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.02914000302553177, "kl": 0.002079188823699951, "learning_rate": 3.3383333333333333e-06, "loss": 0.0001, "num_tokens": 1197278.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 3992 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.5, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 20.5, "completions/mean_terminated_length": 20.5, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 73.94444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.11437245458364487, "kl": 0.1554320976138115, "learning_rate": 3.337777777777778e-06, "loss": 0.0077, "num_tokens": 1197580.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 3993 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.25, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.25, "completions/mean_terminated_length": 13.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 73.96296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.019703829661011696, "kl": 0.002852856763638556, "learning_rate": 3.3372222222222224e-06, "loss": 0.0001, "num_tokens": 1197857.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3994 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.5, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 12.5, "completions/mean_terminated_length": 12.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 73.98148148148148, "frac_reward_zero_std": 0.0, "grad_norm": 2.9467029571533203, "kl": 0.024150094017386436, "learning_rate": 3.3366666666666668e-06, "loss": -0.0853, "num_tokens": 1198131.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 3995 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 35.25, "completions/clipped_ratio": 0.0, "completions/max_length": 46.0, "completions/max_terminated_length": 46.0, "completions/mean_length": 35.25, "completions/mean_terminated_length": 35.25, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 74.0, "frac_reward_zero_std": 0.0, "grad_norm": 3.2593822479248047, "kl": 0.24792253226041794, "learning_rate": 3.3361111111111115e-06, "loss": 0.0471, "num_tokens": 1198500.0, "reward": 5.0, "reward_std": 3.5590262413024902, "rewards/reward_combined/mean": 5.0, "rewards/reward_combined/std": 3.5590262413024902, "step": 3996 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.25, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 17.25, "completions/mean_terminated_length": 17.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 74.01851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.08269819617271423, "kl": 0.1655803918838501, "learning_rate": 3.335555555555556e-06, "loss": 0.0083, "num_tokens": 1198809.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3997 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 5.5, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 5.5, "completions/mean_terminated_length": 5.5, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 74.03703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.26854613423347473, "kl": 0.025278877088567242, "learning_rate": 3.3350000000000003e-06, "loss": 0.0015, "num_tokens": 1199031.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 3998 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.0, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 17.0, "completions/mean_terminated_length": 17.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 74.05555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.06885666400194168, "kl": 0.014779896475374699, "learning_rate": 3.3344444444444446e-06, "loss": 0.0007, "num_tokens": 1199327.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 3999 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.25, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 21.25, "completions/mean_terminated_length": 21.25, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 74.07407407407408, "frac_reward_zero_std": 0.0, "grad_norm": 2.872894287109375, "kl": 0.02186770481057465, "learning_rate": 3.3338888888888894e-06, "loss": 0.0374, "num_tokens": 1199656.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 4000 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 74.0925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.005192930344492197, "kl": 0.010389044880867004, "learning_rate": 3.3333333333333333e-06, "loss": 0.0005, "num_tokens": 1199892.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 4001 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 74.11111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.016290420666337013, "kl": 0.01501120114699006, "learning_rate": 3.332777777777778e-06, "loss": 0.0008, "num_tokens": 1200152.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4002 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.0, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 21.0, "completions/mean_terminated_length": 21.0, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 74.12962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.11221499741077423, "kl": 0.02593704592436552, "learning_rate": 3.3322222222222225e-06, "loss": 0.0013, "num_tokens": 1200456.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4003 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 74.14814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.00015515493578277528, "kl": 4.114210605621338e-05, "learning_rate": 3.331666666666667e-06, "loss": 0.0, "num_tokens": 1200676.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 4004 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 29.0, "completions/clipped_ratio": 0.0, "completions/max_length": 30.0, "completions/max_terminated_length": 30.0, "completions/mean_length": 29.0, "completions/mean_terminated_length": 29.0, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 74.16666666666667, "frac_reward_zero_std": 1.0, "grad_norm": 0.023159898817539215, "kl": 0.122176393866539, "learning_rate": 3.3311111111111116e-06, "loss": 0.0061, "num_tokens": 1201008.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4005 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.0, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 24.0, "completions/mean_terminated_length": 24.0, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 74.18518518518519, "frac_reward_zero_std": 1.0, "grad_norm": 0.007577680982649326, "kl": 0.08863439038395882, "learning_rate": 3.330555555555556e-06, "loss": 0.0044, "num_tokens": 1201372.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 4006 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 36.0, "completions/clipped_ratio": 0.0, "completions/max_length": 38.0, "completions/max_terminated_length": 38.0, "completions/mean_length": 36.0, "completions/mean_terminated_length": 36.0, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 74.20370370370371, "frac_reward_zero_std": 0.0, "grad_norm": 2.590939998626709, "kl": 0.15638994425535202, "learning_rate": 3.3300000000000003e-06, "loss": 0.0302, "num_tokens": 1201744.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 4007 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 74.22222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.01615818217396736, "kl": 0.009385055862367153, "learning_rate": 3.3294444444444447e-06, "loss": 0.0005, "num_tokens": 1202056.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 4008 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.75, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.75, "completions/mean_terminated_length": 14.75, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 74.24074074074075, "frac_reward_zero_std": 1.0, "grad_norm": 1.617246150970459, "kl": 0.26627634558826685, "learning_rate": 3.3288888888888894e-06, "loss": 0.0145, "num_tokens": 1202339.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4009 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.75, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 12.75, "completions/mean_terminated_length": 12.75, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 74.25925925925925, "frac_reward_zero_std": 1.0, "grad_norm": 0.036295581609010696, "kl": 0.01636707130819559, "learning_rate": 3.3283333333333334e-06, "loss": 0.0009, "num_tokens": 1202606.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4010 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 74.27777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.06607616692781448, "kl": 0.006745595484972, "learning_rate": 3.327777777777778e-06, "loss": 0.0003, "num_tokens": 1202866.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 4011 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 74.29629629629629, "frac_reward_zero_std": 1.0, "grad_norm": 0.02435479126870632, "kl": 0.005543069215491414, "learning_rate": 3.327222222222222e-06, "loss": 0.0003, "num_tokens": 1203150.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4012 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 46.5, "completions/clipped_ratio": 0.0, "completions/max_length": 51.0, "completions/max_terminated_length": 51.0, "completions/mean_length": 46.5, "completions/mean_terminated_length": 46.5, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "epoch": 74.31481481481481, "frac_reward_zero_std": 1.0, "grad_norm": 0.033224694430828094, "kl": 0.029877218417823315, "learning_rate": 3.326666666666667e-06, "loss": 0.0015, "num_tokens": 1203616.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 4013 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 74.33333333333333, "frac_reward_zero_std": 1.0, "grad_norm": 0.003180723637342453, "kl": 0.003118655877187848, "learning_rate": 3.3261111111111116e-06, "loss": 0.0002, "num_tokens": 1203912.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4014 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.25, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.25, "completions/mean_terminated_length": 14.25, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 74.35185185185185, "frac_reward_zero_std": 0.0, "grad_norm": 2.8732144832611084, "kl": 0.09823381900787354, "learning_rate": 3.3255555555555556e-06, "loss": 0.0354, "num_tokens": 1204201.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 4015 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 74.37037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.03404972329735756, "kl": 0.24099522829055786, "learning_rate": 3.3250000000000004e-06, "loss": 0.012, "num_tokens": 1204501.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4016 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 74.38888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.04681539162993431, "kl": 0.03232499037403613, "learning_rate": 3.3244444444444447e-06, "loss": 0.0016, "num_tokens": 1204789.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4017 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 74.4074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.062376681715250015, "kl": 0.010275878012180328, "learning_rate": 3.3238888888888895e-06, "loss": 0.0005, "num_tokens": 1205091.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4018 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 74.42592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.018002044409513474, "kl": 0.0014541854616254568, "learning_rate": 3.3233333333333334e-06, "loss": 0.0001, "num_tokens": 1205411.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4019 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 26.25, "completions/clipped_ratio": 0.0, "completions/max_length": 30.0, "completions/max_terminated_length": 30.0, "completions/mean_length": 26.25, "completions/mean_terminated_length": 26.25, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 74.44444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.33702516555786133, "kl": 0.05680888332426548, "learning_rate": 3.322777777777778e-06, "loss": 0.0029, "num_tokens": 1205764.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 4020 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.012820512987673283, "clip_ratio/low_min": 0.012820512987673283, "clip_ratio/region_mean": 0.012820512987673283, "completion_length": 20.0, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 20.0, "completions/mean_terminated_length": 20.0, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 74.46296296296296, "frac_reward_zero_std": 0.0, "grad_norm": 2.7214787006378174, "kl": 0.2126779928803444, "learning_rate": 3.322222222222222e-06, "loss": 0.0126, "num_tokens": 1206072.0, "reward": 5.75, "reward_std": 2.598076105117798, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 2.598076105117798, "step": 4021 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.5, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 3.5, "completions/mean_terminated_length": 3.5, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 74.48148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.1175900474190712, "kl": 0.004062280058860779, "learning_rate": 3.321666666666667e-06, "loss": 0.0002, "num_tokens": 1206282.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 4022 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.5, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 19.5, "completions/mean_terminated_length": 19.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 74.5, "frac_reward_zero_std": 1.0, "grad_norm": 0.03765647113323212, "kl": 0.01768470648676157, "learning_rate": 3.3211111111111117e-06, "loss": 0.0008, "num_tokens": 1206608.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4023 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.75, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 10.75, "completions/mean_terminated_length": 10.75, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 74.51851851851852, "frac_reward_zero_std": 0.0, "grad_norm": 1.1209040880203247, "kl": 0.3070084485807456, "learning_rate": 3.3205555555555556e-06, "loss": 0.1591, "num_tokens": 1206875.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 4024 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.75, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 12.75, "completions/mean_terminated_length": 12.75, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 74.53703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.08991292119026184, "kl": 0.0303479777649045, "learning_rate": 3.3200000000000004e-06, "loss": 0.0016, "num_tokens": 1207146.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4025 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.25, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.25, "completions/mean_terminated_length": 11.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 74.55555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.19015535712242126, "kl": 0.03693551942706108, "learning_rate": 3.3194444444444448e-06, "loss": 0.0019, "num_tokens": 1207419.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4026 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 26.0, "completions/clipped_ratio": 0.0, "completions/max_length": 36.0, "completions/max_terminated_length": 36.0, "completions/mean_length": 26.0, "completions/mean_terminated_length": 26.0, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 74.57407407407408, "frac_reward_zero_std": 0.0, "grad_norm": 2.9143428802490234, "kl": 0.08365915901958942, "learning_rate": 3.318888888888889e-06, "loss": -0.1511, "num_tokens": 1207775.0, "reward": 7.625, "reward_std": 0.25, "rewards/reward_combined/mean": 7.625, "rewards/reward_combined/std": 0.25, "step": 4027 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.75, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.75, "completions/mean_terminated_length": 6.75, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 74.5925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.03595214709639549, "kl": 0.002060944854747504, "learning_rate": 3.3183333333333335e-06, "loss": 0.0001, "num_tokens": 1208010.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 4028 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 33.75, "completions/clipped_ratio": 0.0, "completions/max_length": 41.0, "completions/max_terminated_length": 41.0, "completions/mean_length": 33.75, "completions/mean_terminated_length": 33.75, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 74.61111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.14230121672153473, "kl": 0.09176766499876976, "learning_rate": 3.3177777777777782e-06, "loss": 0.0046, "num_tokens": 1208369.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4029 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 74.62962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.0076397801749408245, "kl": 0.0006276071071624756, "learning_rate": 3.317222222222222e-06, "loss": 0.0, "num_tokens": 1208613.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 4030 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.25, "completions/clipped_ratio": 0.0, "completions/max_length": 29.0, "completions/max_terminated_length": 29.0, "completions/mean_length": 25.25, "completions/mean_terminated_length": 25.25, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 74.64814814814815, "frac_reward_zero_std": 0.0, "grad_norm": 2.116469144821167, "kl": 0.11962315812706947, "learning_rate": 3.316666666666667e-06, "loss": 0.0545, "num_tokens": 1208950.0, "reward": 6.5, "reward_std": 2.0, "rewards/reward_combined/mean": 6.5, "rewards/reward_combined/std": 2.0, "step": 4031 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 74.66666666666667, "frac_reward_zero_std": 1.0, "grad_norm": 0.08608485013246536, "kl": 0.007651162100955844, "learning_rate": 3.3161111111111117e-06, "loss": 0.0004, "num_tokens": 1209210.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 4032 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 74.68518518518519, "frac_reward_zero_std": 1.0, "grad_norm": 0.00791738461703062, "kl": 0.002528382115997374, "learning_rate": 3.3155555555555557e-06, "loss": 0.0001, "num_tokens": 1209490.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4033 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 74.70370370370371, "frac_reward_zero_std": 1.0, "grad_norm": 0.007372970227152109, "kl": 0.0015071183443069458, "learning_rate": 3.3150000000000004e-06, "loss": 0.0001, "num_tokens": 1209702.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 4034 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 74.72222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.10613296180963516, "kl": 0.01960195042192936, "learning_rate": 3.3144444444444444e-06, "loss": 0.0011, "num_tokens": 1209978.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4035 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 74.74074074074075, "frac_reward_zero_std": 1.0, "grad_norm": 0.0017119806725531816, "kl": 0.001815189141780138, "learning_rate": 3.313888888888889e-06, "loss": 0.0001, "num_tokens": 1210258.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4036 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 74.75925925925925, "frac_reward_zero_std": 1.0, "grad_norm": 0.13635367155075073, "kl": 0.010058204177767038, "learning_rate": 3.3133333333333335e-06, "loss": 0.0005, "num_tokens": 1210514.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4037 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.010204081423580647, "clip_ratio/low_min": 0.010204081423580647, "clip_ratio/region_mean": 0.010204081423580647, "completion_length": 27.0, "completions/clipped_ratio": 0.0, "completions/max_length": 37.0, "completions/max_terminated_length": 37.0, "completions/mean_length": 27.0, "completions/mean_terminated_length": 27.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 74.77777777777777, "frac_reward_zero_std": 0.0, "grad_norm": 4.568178653717041, "kl": 0.0824458571150899, "learning_rate": 3.3127777777777783e-06, "loss": 0.0956, "num_tokens": 1210842.0, "reward": 4.25, "reward_std": 4.051748752593994, "rewards/reward_combined/mean": 4.25, "rewards/reward_combined/std": 4.051748752593994, "step": 4038 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 33.25, "completions/clipped_ratio": 0.0, "completions/max_length": 39.0, "completions/max_terminated_length": 39.0, "completions/mean_length": 33.25, "completions/mean_terminated_length": 33.25, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 74.79629629629629, "frac_reward_zero_std": 1.0, "grad_norm": 0.4567437171936035, "kl": 0.2124757245182991, "learning_rate": 3.3122222222222222e-06, "loss": 0.0104, "num_tokens": 1211199.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4039 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.25, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 4.25, "completions/mean_terminated_length": 4.25, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 74.81481481481481, "frac_reward_zero_std": 1.0, "grad_norm": 0.03298242390155792, "kl": 0.0005982607544865459, "learning_rate": 3.311666666666667e-06, "loss": 0.0, "num_tokens": 1211412.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 4040 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 74.83333333333333, "frac_reward_zero_std": 1.0, "grad_norm": 0.0184711255133152, "kl": 0.0002942055434687063, "learning_rate": 3.3111111111111118e-06, "loss": 0.0, "num_tokens": 1211668.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 4041 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 74.85185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.04969555512070656, "kl": 0.008443485479801893, "learning_rate": 3.3105555555555557e-06, "loss": 0.0004, "num_tokens": 1211956.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4042 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 74.87037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.1268763393163681, "kl": 0.0076858134707435966, "learning_rate": 3.3100000000000005e-06, "loss": 0.0004, "num_tokens": 1212224.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4043 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.25, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 20.25, "completions/mean_terminated_length": 20.25, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 74.88888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.08993561565876007, "kl": 0.026485209353268147, "learning_rate": 3.3094444444444444e-06, "loss": 0.0013, "num_tokens": 1212557.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4044 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.5, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 19.5, "completions/mean_terminated_length": 19.5, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 74.9074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 1.8260369300842285, "kl": 0.32615216076374054, "learning_rate": 3.308888888888889e-06, "loss": 0.0163, "num_tokens": 1212855.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 4045 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.75, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 22.75, "completions/mean_terminated_length": 22.75, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 74.92592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.15739908814430237, "kl": 0.031192888505756855, "learning_rate": 3.3083333333333336e-06, "loss": 0.0016, "num_tokens": 1213174.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4046 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 41.25, "completions/clipped_ratio": 0.0, "completions/max_length": 48.0, "completions/max_terminated_length": 48.0, "completions/mean_length": 41.25, "completions/mean_terminated_length": 41.25, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 74.94444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.06872253119945526, "kl": 0.04670047573745251, "learning_rate": 3.307777777777778e-06, "loss": 0.0019, "num_tokens": 1213559.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4047 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 74.96296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.4636685252189636, "kl": 0.08773758262395859, "learning_rate": 3.3072222222222223e-06, "loss": 0.0049, "num_tokens": 1213831.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4048 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 18.0, "completions/mean_terminated_length": 18.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 74.98148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.007705557160079479, "kl": 0.0026109731988981366, "learning_rate": 3.306666666666667e-06, "loss": 0.0001, "num_tokens": 1214143.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4049 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 75.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.01017053797841072, "kl": 0.032810866832733154, "learning_rate": 3.3061111111111114e-06, "loss": 0.0016, "num_tokens": 1214359.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 4050 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.25, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 16.25, "completions/mean_terminated_length": 16.25, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 75.01851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.02346186153590679, "kl": 0.00527399784186855, "learning_rate": 3.3055555555555558e-06, "loss": 0.0003, "num_tokens": 1214652.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4051 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 7.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 7.0, "completions/mean_terminated_length": 7.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 75.03703703703704, "frac_reward_zero_std": 0.0, "grad_norm": 7.564380645751953, "kl": 0.023263100534677505, "learning_rate": 3.3050000000000005e-06, "loss": 0.2868, "num_tokens": 1214876.0, "reward": 2.75, "reward_std": 1.5, "rewards/reward_combined/mean": 2.75, "rewards/reward_combined/std": 1.5, "step": 4052 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 75.05555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.035077325999736786, "kl": 0.007222274551168084, "learning_rate": 3.3044444444444445e-06, "loss": 0.0004, "num_tokens": 1215164.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4053 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.5, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 8.5, "completions/mean_terminated_length": 8.5, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 75.07407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.06888289004564285, "kl": 0.005652182502672076, "learning_rate": 3.3038888888888893e-06, "loss": 0.0003, "num_tokens": 1215418.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4054 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 75.0925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.26921290159225464, "kl": 0.05348587594926357, "learning_rate": 3.303333333333333e-06, "loss": 0.0026, "num_tokens": 1215704.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4055 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.5, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 4.5, "completions/mean_terminated_length": 4.5, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 75.11111111111111, "frac_reward_zero_std": 0.0, "grad_norm": 8.415608406066895, "kl": 0.11716141551733017, "learning_rate": 3.302777777777778e-06, "loss": 0.3161, "num_tokens": 1215918.0, "reward": 2.75, "reward_std": 1.5, "rewards/reward_combined/mean": 2.75, "rewards/reward_combined/std": 1.5, "step": 4056 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 48.75, "completions/clipped_ratio": 0.0, "completions/max_length": 56.0, "completions/max_terminated_length": 56.0, "completions/mean_length": 48.75, "completions/mean_terminated_length": 48.75, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 75.12962962962963, "frac_reward_zero_std": 0.0, "grad_norm": 2.063485860824585, "kl": 0.12016160413622856, "learning_rate": 3.3022222222222223e-06, "loss": 0.0462, "num_tokens": 1216333.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 4057 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 75.14814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.3847489655017853, "kl": 0.0788373937830329, "learning_rate": 3.301666666666667e-06, "loss": 0.0041, "num_tokens": 1216619.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4058 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.75, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 13.75, "completions/mean_terminated_length": 13.75, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 75.16666666666667, "frac_reward_zero_std": 1.0, "grad_norm": 0.04803955927491188, "kl": 0.013694403227418661, "learning_rate": 3.3011111111111115e-06, "loss": 0.0007, "num_tokens": 1216906.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4059 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 35.25, "completions/clipped_ratio": 0.0, "completions/max_length": 38.0, "completions/max_terminated_length": 38.0, "completions/mean_length": 35.25, "completions/mean_terminated_length": 35.25, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 75.18518518518519, "frac_reward_zero_std": 1.0, "grad_norm": 0.03460414335131645, "kl": 0.07568293809890747, "learning_rate": 3.300555555555556e-06, "loss": 0.0038, "num_tokens": 1217275.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4060 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 75.20370370370371, "frac_reward_zero_std": 1.0, "grad_norm": 0.05012766644358635, "kl": 0.010839865542948246, "learning_rate": 3.3000000000000006e-06, "loss": 0.0006, "num_tokens": 1217577.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4061 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 75.22222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.01722618192434311, "kl": 0.2273162603378296, "learning_rate": 3.2994444444444445e-06, "loss": 0.0113, "num_tokens": 1217879.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4062 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 75.24074074074075, "frac_reward_zero_std": 1.0, "grad_norm": 0.004520075395703316, "kl": 0.01049642264842987, "learning_rate": 3.2988888888888893e-06, "loss": 0.0005, "num_tokens": 1218115.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 4063 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 40.75, "completions/clipped_ratio": 0.0, "completions/max_length": 94.0, "completions/max_terminated_length": 94.0, "completions/mean_length": 40.75, "completions/mean_terminated_length": 40.75, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 75.25925925925925, "frac_reward_zero_std": 0.0, "grad_norm": 4.651469707489014, "kl": 0.118059191852808, "learning_rate": 3.2983333333333332e-06, "loss": 0.3008, "num_tokens": 1218502.0, "reward": 5.875, "reward_std": 3.5910768508911133, "rewards/reward_combined/mean": 5.875, "rewards/reward_combined/std": 3.5910770893096924, "step": 4064 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 75.27777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.0031788728665560484, "kl": 0.0033016460947692394, "learning_rate": 3.297777777777778e-06, "loss": 0.0002, "num_tokens": 1218798.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4065 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 75.29629629629629, "frac_reward_zero_std": 1.0, "grad_norm": 0.015162953175604343, "kl": 0.012546357698738575, "learning_rate": 3.297222222222223e-06, "loss": 0.0006, "num_tokens": 1219070.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4066 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.25, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 19.25, "completions/mean_terminated_length": 19.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 75.31481481481481, "frac_reward_zero_std": 1.0, "grad_norm": 0.046172596514225006, "kl": 0.017715079709887505, "learning_rate": 3.2966666666666667e-06, "loss": 0.0009, "num_tokens": 1219399.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4067 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.0, "completions/clipped_ratio": 0.0, "completions/max_length": 29.0, "completions/max_terminated_length": 29.0, "completions/mean_length": 23.0, "completions/mean_terminated_length": 23.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 75.33333333333333, "frac_reward_zero_std": 0.0, "grad_norm": 3.578463315963745, "kl": 0.09744481928646564, "learning_rate": 3.2961111111111115e-06, "loss": 0.0002, "num_tokens": 1219711.0, "reward": 4.375, "reward_std": 3.902456521987915, "rewards/reward_combined/mean": 4.375, "rewards/reward_combined/std": 3.902456521987915, "step": 4068 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.5, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 17.5, "completions/mean_terminated_length": 17.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 75.35185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.09683169424533844, "kl": 0.16070759296417236, "learning_rate": 3.295555555555556e-06, "loss": 0.008, "num_tokens": 1220021.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4069 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.0, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 24.0, "completions/mean_terminated_length": 24.0, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 75.37037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.008953607641160488, "kl": 0.08814912289381027, "learning_rate": 3.2950000000000002e-06, "loss": 0.0044, "num_tokens": 1220385.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 4070 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.25, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.25, "completions/mean_terminated_length": 13.25, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 75.38888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.15302401781082153, "kl": 0.02249889774248004, "learning_rate": 3.2944444444444446e-06, "loss": 0.0011, "num_tokens": 1220662.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4071 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.5, "completions/clipped_ratio": 0.0, "completions/max_length": 30.0, "completions/max_terminated_length": 30.0, "completions/mean_length": 25.5, "completions/mean_terminated_length": 25.5, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 75.4074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.11694760620594025, "kl": 0.08353347331285477, "learning_rate": 3.2938888888888894e-06, "loss": 0.0041, "num_tokens": 1221000.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4072 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.5, "completions/clipped_ratio": 0.0, "completions/max_length": 27.0, "completions/max_terminated_length": 27.0, "completions/mean_length": 20.5, "completions/mean_terminated_length": 20.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 75.42592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.11617167294025421, "kl": 0.10599019378423691, "learning_rate": 3.2933333333333333e-06, "loss": 0.0053, "num_tokens": 1221310.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4073 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.25, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 19.25, "completions/mean_terminated_length": 19.25, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 75.44444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.16245155036449432, "kl": 0.029377606697380543, "learning_rate": 3.292777777777778e-06, "loss": 0.0015, "num_tokens": 1221627.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4074 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.25, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 19.25, "completions/mean_terminated_length": 19.25, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 75.46296296296296, "frac_reward_zero_std": 0.0, "grad_norm": 4.160627365112305, "kl": 0.0462261326611042, "learning_rate": 3.292222222222223e-06, "loss": 0.0111, "num_tokens": 1221924.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 4075 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 7.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 7.0, "completions/mean_terminated_length": 7.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 75.48148148148148, "frac_reward_zero_std": 0.0, "grad_norm": 8.668574333190918, "kl": 0.022542424499988556, "learning_rate": 3.2916666666666668e-06, "loss": -0.161, "num_tokens": 1222164.0, "reward": 3.875, "reward_std": 0.25, "rewards/reward_combined/mean": 3.875, "rewards/reward_combined/std": 0.25, "step": 4076 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 75.5, "frac_reward_zero_std": 1.0, "grad_norm": 0.0367329940199852, "kl": 0.002324555185623467, "learning_rate": 3.2911111111111116e-06, "loss": 0.0001, "num_tokens": 1222428.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4077 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.5, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 12.5, "completions/mean_terminated_length": 12.5, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 75.51851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.0807008370757103, "kl": 0.013674364192411304, "learning_rate": 3.290555555555556e-06, "loss": 0.0008, "num_tokens": 1222702.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4078 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 75.53703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.05105382576584816, "kl": 0.01959164347499609, "learning_rate": 3.2900000000000003e-06, "loss": 0.001, "num_tokens": 1222970.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4079 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 5.5, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 5.5, "completions/mean_terminated_length": 5.5, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 75.55555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.3196834325790405, "kl": 0.055679429322481155, "learning_rate": 3.2894444444444446e-06, "loss": 0.0024, "num_tokens": 1223212.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 4080 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 5.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 5.0, "completions/mean_terminated_length": 5.0, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 75.57407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.0038683239836245775, "kl": 0.0004380285827210173, "learning_rate": 3.2888888888888894e-06, "loss": 0.0, "num_tokens": 1223432.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 4081 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 75.5925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.007315284572541714, "kl": 0.0016946643590927124, "learning_rate": 3.2883333333333333e-06, "loss": 0.0001, "num_tokens": 1223644.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 4082 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.75, "completions/clipped_ratio": 0.0, "completions/max_length": 25.0, "completions/max_terminated_length": 25.0, "completions/mean_length": 22.75, "completions/mean_terminated_length": 22.75, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 75.61111111111111, "frac_reward_zero_std": 0.0, "grad_norm": 3.349374532699585, "kl": 0.06989583559334278, "learning_rate": 3.287777777777778e-06, "loss": 0.036, "num_tokens": 1223963.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 4083 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.75, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 10.75, "completions/mean_terminated_length": 10.75, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 75.62962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.0948900580406189, "kl": 0.025012478232383728, "learning_rate": 3.287222222222223e-06, "loss": 0.0012, "num_tokens": 1224226.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4084 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 75.64814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.04658322036266327, "kl": 0.0037398752756416798, "learning_rate": 3.286666666666667e-06, "loss": 0.0002, "num_tokens": 1224504.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4085 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 75.66666666666667, "frac_reward_zero_std": 1.0, "grad_norm": 0.01674666441977024, "kl": 0.014946699142456055, "learning_rate": 3.2861111111111116e-06, "loss": 0.0007, "num_tokens": 1224764.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4086 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 75.68518518518519, "frac_reward_zero_std": 1.0, "grad_norm": 0.007941244170069695, "kl": 0.00037900109600741416, "learning_rate": 3.2855555555555555e-06, "loss": 0.0, "num_tokens": 1225036.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4087 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.25, "completions/clipped_ratio": 0.0, "completions/max_length": 27.0, "completions/max_terminated_length": 27.0, "completions/mean_length": 24.25, "completions/mean_terminated_length": 24.25, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 75.70370370370371, "frac_reward_zero_std": 1.0, "grad_norm": 0.030849065631628036, "kl": 0.028511147014796734, "learning_rate": 3.2850000000000003e-06, "loss": 0.0014, "num_tokens": 1225381.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 4088 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.75, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 18.75, "completions/mean_terminated_length": 18.75, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 75.72222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.2174273282289505, "kl": 0.028126472607254982, "learning_rate": 3.2844444444444447e-06, "loss": 0.0016, "num_tokens": 1225700.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4089 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 75.74074074074075, "frac_reward_zero_std": 1.0, "grad_norm": 0.02165832929313183, "kl": 0.0034261951223015785, "learning_rate": 3.2838888888888894e-06, "loss": 0.0002, "num_tokens": 1226012.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 4090 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.25, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.25, "completions/mean_terminated_length": 6.25, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 75.75925925925925, "frac_reward_zero_std": 1.0, "grad_norm": 0.06131383776664734, "kl": 0.003591170650906861, "learning_rate": 3.2833333333333334e-06, "loss": 0.0002, "num_tokens": 1226245.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 4091 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.25, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 10.25, "completions/mean_terminated_length": 10.25, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 75.77777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.03986046090722084, "kl": 0.0007848815439501777, "learning_rate": 3.282777777777778e-06, "loss": 0.0, "num_tokens": 1226502.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 4092 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.5, "completions/clipped_ratio": 0.0, "completions/max_length": 27.0, "completions/max_terminated_length": 27.0, "completions/mean_length": 21.5, "completions/mean_terminated_length": 21.5, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 75.79629629629629, "frac_reward_zero_std": 1.0, "grad_norm": 0.10877567529678345, "kl": 0.0791592001914978, "learning_rate": 3.282222222222223e-06, "loss": 0.004, "num_tokens": 1226808.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 4093 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 75.81481481481481, "frac_reward_zero_std": 1.0, "grad_norm": 0.071029894053936, "kl": 0.006761729717254639, "learning_rate": 3.281666666666667e-06, "loss": 0.0003, "num_tokens": 1227068.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 4094 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 75.83333333333333, "frac_reward_zero_std": 1.0, "grad_norm": 0.00015556688595097512, "kl": 4.144012928009033e-05, "learning_rate": 3.2811111111111116e-06, "loss": 0.0, "num_tokens": 1227288.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 4095 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.010638297535479069, "clip_ratio/low_min": 0.010638297535479069, "clip_ratio/region_mean": 0.010638297535479069, "completion_length": 25.25, "completions/clipped_ratio": 0.0, "completions/max_length": 28.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 25.25, "completions/mean_terminated_length": 25.25, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 75.85185185185185, "frac_reward_zero_std": 0.0, "grad_norm": 1.9032843112945557, "kl": 0.09674993716180325, "learning_rate": 3.2805555555555556e-06, "loss": 0.1001, "num_tokens": 1227641.0, "reward": 4.125, "reward_std": 2.25, "rewards/reward_combined/mean": 4.125, "rewards/reward_combined/std": 2.25, "step": 4096 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.5, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 17.5, "completions/mean_terminated_length": 17.5, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 75.87037037037037, "frac_reward_zero_std": 0.0, "grad_norm": 1.3427618741989136, "kl": 0.2250650580972433, "learning_rate": 3.2800000000000004e-06, "loss": 0.0257, "num_tokens": 1227971.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 4097 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 30.0, "completions/clipped_ratio": 0.0, "completions/max_length": 33.0, "completions/max_terminated_length": 33.0, "completions/mean_length": 30.0, "completions/mean_terminated_length": 30.0, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 75.88888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.061981432139873505, "kl": 0.13651008531451225, "learning_rate": 3.2794444444444447e-06, "loss": 0.0068, "num_tokens": 1228307.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4098 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.5, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 18.5, "completions/mean_terminated_length": 18.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 75.9074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.035322025418281555, "kl": 0.022647385485470295, "learning_rate": 3.278888888888889e-06, "loss": 0.0011, "num_tokens": 1228629.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4099 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.75, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 13.75, "completions/mean_terminated_length": 13.75, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 75.92592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.0613938570022583, "kl": 0.016938024200499058, "learning_rate": 3.2783333333333334e-06, "loss": 0.0008, "num_tokens": 1228900.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4100 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 75.94444444444444, "frac_reward_zero_std": 0.0, "grad_norm": 3.448820114135742, "kl": 0.036810910794883966, "learning_rate": 3.277777777777778e-06, "loss": 0.0007, "num_tokens": 1229184.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 4101 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 75.96296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.013677169568836689, "kl": 0.03275100141763687, "learning_rate": 3.2772222222222226e-06, "loss": 0.0016, "num_tokens": 1229400.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 4102 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 42.0, "completions/clipped_ratio": 0.0, "completions/max_length": 44.0, "completions/max_terminated_length": 44.0, "completions/mean_length": 42.0, "completions/mean_terminated_length": 42.0, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 75.98148148148148, "frac_reward_zero_std": 0.0, "grad_norm": 1.7039743661880493, "kl": 0.4331730492413044, "learning_rate": 3.276666666666667e-06, "loss": 0.0161, "num_tokens": 1229848.0, "reward": 2.799999952316284, "reward_std": 0.4000000059604645, "rewards/reward_combined/mean": 2.799999952316284, "rewards/reward_combined/std": 0.4000000059604645, "step": 4103 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 33.25, "completions/clipped_ratio": 0.0, "completions/max_length": 38.0, "completions/max_terminated_length": 38.0, "completions/mean_length": 33.25, "completions/mean_terminated_length": 33.25, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 76.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.09243586659431458, "kl": 0.03072173520922661, "learning_rate": 3.2761111111111117e-06, "loss": 0.0016, "num_tokens": 1230205.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4104 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 76.01851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.07434765994548798, "kl": 0.01703722681850195, "learning_rate": 3.2755555555555556e-06, "loss": 0.0009, "num_tokens": 1230479.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4105 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.75, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 13.75, "completions/mean_terminated_length": 13.75, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 76.03703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.07635124772787094, "kl": 0.021010359283536673, "learning_rate": 3.2750000000000004e-06, "loss": 0.0011, "num_tokens": 1230766.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4106 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 76.05555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.01169971190392971, "kl": 0.03252864629030228, "learning_rate": 3.2744444444444443e-06, "loss": 0.0016, "num_tokens": 1230982.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 4107 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.5, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 19.5, "completions/mean_terminated_length": 19.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 76.07407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.03919696807861328, "kl": 0.022837942466139793, "learning_rate": 3.273888888888889e-06, "loss": 0.0011, "num_tokens": 1231308.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4108 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.25, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 18.25, "completions/mean_terminated_length": 18.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 76.0925925925926, "frac_reward_zero_std": 0.0, "grad_norm": 3.107180118560791, "kl": 0.13403139263391495, "learning_rate": 3.2733333333333335e-06, "loss": 0.0672, "num_tokens": 1231621.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 4109 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 76.11111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.10169100016355515, "kl": 0.013271224219352007, "learning_rate": 3.2727777777777783e-06, "loss": 0.0007, "num_tokens": 1231891.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4110 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.75, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 16.75, "completions/mean_terminated_length": 16.75, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 76.12962962962963, "frac_reward_zero_std": 0.0, "grad_norm": 4.736695289611816, "kl": 0.33187004923820496, "learning_rate": 3.2722222222222226e-06, "loss": 0.0152, "num_tokens": 1232178.0, "reward": 5.25, "reward_std": 2.598076105117798, "rewards/reward_combined/mean": 5.25, "rewards/reward_combined/std": 2.598076105117798, "step": 4111 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 76.14814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.009385540150105953, "kl": 0.0009637296025175601, "learning_rate": 3.271666666666667e-06, "loss": 0.0, "num_tokens": 1232438.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 4112 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 76.16666666666667, "frac_reward_zero_std": 1.0, "grad_norm": 0.009557562880218029, "kl": 0.0031636448111385107, "learning_rate": 3.2711111111111117e-06, "loss": 0.0002, "num_tokens": 1232722.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4113 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.25, "completions/clipped_ratio": 0.0, "completions/max_length": 29.0, "completions/max_terminated_length": 29.0, "completions/mean_length": 23.25, "completions/mean_terminated_length": 23.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 76.18518518518519, "frac_reward_zero_std": 0.0, "grad_norm": 3.5434253215789795, "kl": 0.10527209937572479, "learning_rate": 3.2705555555555557e-06, "loss": 0.0016, "num_tokens": 1233043.0, "reward": 4.5, "reward_std": 4.041451930999756, "rewards/reward_combined/mean": 4.5, "rewards/reward_combined/std": 4.041451930999756, "step": 4114 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.25, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 19.25, "completions/mean_terminated_length": 19.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 76.20370370370371, "frac_reward_zero_std": 1.0, "grad_norm": 0.04371354356408119, "kl": 0.011835415847599506, "learning_rate": 3.2700000000000005e-06, "loss": 0.0006, "num_tokens": 1233364.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4115 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 39.75, "completions/clipped_ratio": 0.0, "completions/max_length": 44.0, "completions/max_terminated_length": 44.0, "completions/mean_length": 39.75, "completions/mean_terminated_length": 39.75, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 76.22222222222223, "frac_reward_zero_std": 0.0, "grad_norm": 2.1065244674682617, "kl": 0.05007171630859375, "learning_rate": 3.2694444444444444e-06, "loss": -0.0604, "num_tokens": 1233803.0, "reward": 2.174999952316284, "reward_std": 1.649999976158142, "rewards/reward_combined/mean": 2.174999952316284, "rewards/reward_combined/std": 1.649999976158142, "step": 4116 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 76.24074074074075, "frac_reward_zero_std": 1.0, "grad_norm": 0.03173135593533516, "kl": 0.01887934934347868, "learning_rate": 3.268888888888889e-06, "loss": 0.0009, "num_tokens": 1234071.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4117 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 56.5, "completions/clipped_ratio": 0.0, "completions/max_length": 109.0, "completions/max_terminated_length": 109.0, "completions/mean_length": 56.5, "completions/mean_terminated_length": 56.5, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 76.25925925925925, "frac_reward_zero_std": 0.0, "grad_norm": 1.6835732460021973, "kl": 0.14434567280113697, "learning_rate": 3.2683333333333335e-06, "loss": 0.1995, "num_tokens": 1234517.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 4118 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 76.27777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.09227301180362701, "kl": 0.010926283895969391, "learning_rate": 3.267777777777778e-06, "loss": 0.0005, "num_tokens": 1234729.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 4119 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.0, "completions/clipped_ratio": 0.0, "completions/max_length": 37.0, "completions/max_terminated_length": 37.0, "completions/mean_length": 22.0, "completions/mean_terminated_length": 22.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 76.29629629629629, "frac_reward_zero_std": 0.0, "grad_norm": 2.057081699371338, "kl": 0.031092682853341103, "learning_rate": 3.2672222222222227e-06, "loss": 0.0977, "num_tokens": 1235037.0, "reward": 7.625, "reward_std": 0.25, "rewards/reward_combined/mean": 7.625, "rewards/reward_combined/std": 0.25, "step": 4120 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 76.31481481481481, "frac_reward_zero_std": 1.0, "grad_norm": 0.019402550533413887, "kl": 0.0005402237111411523, "learning_rate": 3.266666666666667e-06, "loss": 0.0, "num_tokens": 1235293.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 4121 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 76.33333333333333, "frac_reward_zero_std": 1.0, "grad_norm": 0.022073211148381233, "kl": 0.0021987141808494925, "learning_rate": 3.2661111111111114e-06, "loss": 0.0001, "num_tokens": 1235561.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4122 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 76.35185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.000156149995746091, "kl": 4.08366322517395e-05, "learning_rate": 3.2655555555555557e-06, "loss": 0.0, "num_tokens": 1235781.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 4123 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.25, "completions/clipped_ratio": 0.0, "completions/max_length": 39.0, "completions/max_terminated_length": 39.0, "completions/mean_length": 23.25, "completions/mean_terminated_length": 23.25, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 76.37037037037037, "frac_reward_zero_std": 0.0, "grad_norm": 3.332554340362549, "kl": 0.09656741097569466, "learning_rate": 3.2650000000000005e-06, "loss": 0.1628, "num_tokens": 1236094.0, "reward": 2.5, "reward_std": 1.7320507764816284, "rewards/reward_combined/mean": 2.5, "rewards/reward_combined/std": 1.7320507764816284, "step": 4124 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 76.38888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.008012130856513977, "kl": 0.000530291348695755, "learning_rate": 3.2644444444444444e-06, "loss": 0.0, "num_tokens": 1236338.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 4125 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.0, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 21.0, "completions/mean_terminated_length": 21.0, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 76.4074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.03820671886205673, "kl": 0.015762259252369404, "learning_rate": 3.2638888888888892e-06, "loss": 0.0008, "num_tokens": 1236674.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4126 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 76.42592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.13085271418094635, "kl": 0.01553339697420597, "learning_rate": 3.263333333333333e-06, "loss": 0.0008, "num_tokens": 1236972.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4127 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.5, "completions/clipped_ratio": 0.0, "completions/max_length": 29.0, "completions/max_terminated_length": 29.0, "completions/mean_length": 19.5, "completions/mean_terminated_length": 19.5, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 76.44444444444444, "frac_reward_zero_std": 0.0, "grad_norm": 3.645235061645508, "kl": 0.03959335805848241, "learning_rate": 3.262777777777778e-06, "loss": 0.1182, "num_tokens": 1237310.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 4128 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 76.46296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.020231498405337334, "kl": 0.0007226656016428024, "learning_rate": 3.2622222222222227e-06, "loss": 0.0, "num_tokens": 1237580.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4129 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 76.48148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.03054707683622837, "kl": 0.004457566887140274, "learning_rate": 3.261666666666667e-06, "loss": 0.0002, "num_tokens": 1237840.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 4130 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.25, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 10.25, "completions/mean_terminated_length": 10.25, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 76.5, "frac_reward_zero_std": 0.0, "grad_norm": 2.5628061294555664, "kl": 0.6456300715799443, "learning_rate": 3.2611111111111114e-06, "loss": 0.1601, "num_tokens": 1238101.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 4131 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 76.51851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.018182413652539253, "kl": 0.01242586225271225, "learning_rate": 3.2605555555555558e-06, "loss": 0.0006, "num_tokens": 1238413.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 4132 }, { "clip_ratio/high_max": 0.008333333767950535, "clip_ratio/high_mean": 0.008333333767950535, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.008333333767950535, "completion_length": 29.0, "completions/clipped_ratio": 0.0, "completions/max_length": 36.0, "completions/max_terminated_length": 36.0, "completions/mean_length": 29.0, "completions/mean_terminated_length": 29.0, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 76.53703703703704, "frac_reward_zero_std": 0.0, "grad_norm": 3.1144959926605225, "kl": 0.15535196289420128, "learning_rate": 3.2600000000000006e-06, "loss": 0.1375, "num_tokens": 1238753.0, "reward": 6.75, "reward_std": 2.1794495582580566, "rewards/reward_combined/mean": 6.75, "rewards/reward_combined/std": 2.1794495582580566, "step": 4133 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.5, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 22.5, "completions/mean_terminated_length": 22.5, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 76.55555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.052281033247709274, "kl": 0.013458873145282269, "learning_rate": 3.2594444444444445e-06, "loss": 0.0007, "num_tokens": 1239071.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4134 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 26.0, "completions/clipped_ratio": 0.0, "completions/max_length": 32.0, "completions/max_terminated_length": 32.0, "completions/mean_length": 26.0, "completions/mean_terminated_length": 26.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 76.57407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.0564735122025013, "kl": 0.03984534181654453, "learning_rate": 3.2588888888888893e-06, "loss": 0.0021, "num_tokens": 1239423.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 4135 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.0, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 24.0, "completions/mean_terminated_length": 24.0, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 76.5925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.009123039431869984, "kl": 0.08817578107118607, "learning_rate": 3.258333333333333e-06, "loss": 0.0044, "num_tokens": 1239787.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 4136 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 26.5, "completions/clipped_ratio": 0.0, "completions/max_length": 34.0, "completions/max_terminated_length": 34.0, "completions/mean_length": 26.5, "completions/mean_terminated_length": 26.5, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 76.61111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.10388632118701935, "kl": 0.09608929604291916, "learning_rate": 3.257777777777778e-06, "loss": 0.0048, "num_tokens": 1240129.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4137 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 76.62962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.10811858624219894, "kl": 0.2540420666337013, "learning_rate": 3.2572222222222228e-06, "loss": 0.0127, "num_tokens": 1240429.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4138 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 5.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 5.0, "completions/mean_terminated_length": 5.0, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 76.64814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.004248336888849735, "kl": 0.0004427194653544575, "learning_rate": 3.2566666666666667e-06, "loss": 0.0, "num_tokens": 1240649.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 4139 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.0, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 17.0, "completions/mean_terminated_length": 17.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 76.66666666666667, "frac_reward_zero_std": 1.0, "grad_norm": 0.047887515276670456, "kl": 0.007743888301774859, "learning_rate": 3.2561111111111115e-06, "loss": 0.0004, "num_tokens": 1240945.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4140 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 16.5, "completions/mean_terminated_length": 16.5, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 76.68518518518519, "frac_reward_zero_std": 1.0, "grad_norm": 0.048394136130809784, "kl": 0.012839959934353828, "learning_rate": 3.255555555555556e-06, "loss": 0.0006, "num_tokens": 1241235.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4141 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 29.5, "completions/clipped_ratio": 0.0, "completions/max_length": 30.0, "completions/max_terminated_length": 30.0, "completions/mean_length": 29.5, "completions/mean_terminated_length": 29.5, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 76.70370370370371, "frac_reward_zero_std": 1.0, "grad_norm": 0.05190194025635719, "kl": 0.12497377023100853, "learning_rate": 3.255e-06, "loss": 0.0063, "num_tokens": 1241569.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4142 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 76.72222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.009379399009048939, "kl": 0.0021293588215485215, "learning_rate": 3.2544444444444445e-06, "loss": 0.0001, "num_tokens": 1241849.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4143 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.25, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.25, "completions/mean_terminated_length": 2.25, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 76.74074074074075, "frac_reward_zero_std": 1.0, "grad_norm": 1.272886037826538, "kl": 0.17415320873260498, "learning_rate": 3.2538888888888893e-06, "loss": 0.0104, "num_tokens": 1242062.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 4144 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 35.5, "completions/clipped_ratio": 0.0, "completions/max_length": 42.0, "completions/max_terminated_length": 42.0, "completions/mean_length": 35.5, "completions/mean_terminated_length": 35.5, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 76.75925925925925, "frac_reward_zero_std": 1.0, "grad_norm": 0.1465868353843689, "kl": 0.05989612825214863, "learning_rate": 3.2533333333333332e-06, "loss": 0.0029, "num_tokens": 1242428.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4145 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 76.77777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 2.152862787246704, "kl": 0.4437309354543686, "learning_rate": 3.252777777777778e-06, "loss": 0.0222, "num_tokens": 1242664.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 4146 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 76.79629629629629, "frac_reward_zero_std": 1.0, "grad_norm": 0.03761902078986168, "kl": 0.0030153074767440557, "learning_rate": 3.252222222222223e-06, "loss": 0.0002, "num_tokens": 1242952.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4147 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 76.81481481481481, "frac_reward_zero_std": 1.0, "grad_norm": 0.5608638525009155, "kl": 0.057920537889003754, "learning_rate": 3.2516666666666667e-06, "loss": 0.0029, "num_tokens": 1243164.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 4148 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 76.83333333333333, "frac_reward_zero_std": 1.0, "grad_norm": 0.04080627113580704, "kl": 0.005455927224829793, "learning_rate": 3.2511111111111115e-06, "loss": 0.0003, "num_tokens": 1243468.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4149 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.5, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.5, "completions/mean_terminated_length": 6.5, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 76.85185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.13111762702465057, "kl": 0.018211786751635373, "learning_rate": 3.250555555555556e-06, "loss": 0.0009, "num_tokens": 1243702.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 4150 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 76.87037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.0013243185821920633, "kl": 0.001633103413041681, "learning_rate": 3.2500000000000002e-06, "loss": 0.0001, "num_tokens": 1243982.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4151 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.25, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 20.25, "completions/mean_terminated_length": 20.25, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 76.88888888888889, "frac_reward_zero_std": 0.0, "grad_norm": 4.19338846206665, "kl": 0.06216178834438324, "learning_rate": 3.2494444444444446e-06, "loss": -0.0091, "num_tokens": 1244283.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 4152 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.5, "completions/clipped_ratio": 0.0, "completions/max_length": 25.0, "completions/max_terminated_length": 25.0, "completions/mean_length": 19.5, "completions/mean_terminated_length": 19.5, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 76.9074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.15582413971424103, "kl": 0.08899224177002907, "learning_rate": 3.2488888888888894e-06, "loss": 0.0042, "num_tokens": 1244613.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4153 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 36.75, "completions/clipped_ratio": 0.0, "completions/max_length": 63.0, "completions/max_terminated_length": 63.0, "completions/mean_length": 36.75, "completions/mean_terminated_length": 36.75, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 76.92592592592592, "frac_reward_zero_std": 0.0, "grad_norm": 2.4140126705169678, "kl": 0.08671359717845917, "learning_rate": 3.2483333333333333e-06, "loss": 0.2093, "num_tokens": 1244988.0, "reward": 6.0, "reward_std": 3.34165620803833, "rewards/reward_combined/mean": 6.0, "rewards/reward_combined/std": 3.34165620803833, "step": 4154 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 76.94444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.04090309143066406, "kl": 0.009490426164120436, "learning_rate": 3.247777777777778e-06, "loss": 0.0005, "num_tokens": 1245280.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4155 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 76.96296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.018145201727747917, "kl": 0.01441947277635336, "learning_rate": 3.247222222222223e-06, "loss": 0.0007, "num_tokens": 1245540.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4156 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 76.98148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.08627333492040634, "kl": 0.017811238765716553, "learning_rate": 3.2466666666666668e-06, "loss": 0.0009, "num_tokens": 1245812.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4157 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 18.0, "completions/mean_terminated_length": 18.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 77.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.0087266331538558, "kl": 0.001866291044279933, "learning_rate": 3.2461111111111116e-06, "loss": 0.0001, "num_tokens": 1246124.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4158 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.0, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 19.0, "completions/mean_terminated_length": 19.0, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 77.01851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.13231998682022095, "kl": 0.12566576898097992, "learning_rate": 3.2455555555555555e-06, "loss": 0.0063, "num_tokens": 1246428.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4159 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.25, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.25, "completions/mean_terminated_length": 13.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 77.03703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.011822224594652653, "kl": 0.0020604393212124705, "learning_rate": 3.2450000000000003e-06, "loss": 0.0001, "num_tokens": 1246705.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4160 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.75, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 20.75, "completions/mean_terminated_length": 20.75, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 77.05555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.04144711792469025, "kl": 0.019003848545253277, "learning_rate": 3.2444444444444446e-06, "loss": 0.0009, "num_tokens": 1247036.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4161 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 77.07407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.06136127933859825, "kl": 0.015186659526079893, "learning_rate": 3.2438888888888894e-06, "loss": 0.0008, "num_tokens": 1247308.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4162 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 77.0925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.35560113191604614, "kl": 0.040360926650464535, "learning_rate": 3.2433333333333333e-06, "loss": 0.002, "num_tokens": 1247612.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4163 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 77.11111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.019663682207465172, "kl": 0.2518113926053047, "learning_rate": 3.242777777777778e-06, "loss": 0.0125, "num_tokens": 1247910.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4164 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 45.25, "completions/clipped_ratio": 0.0, "completions/max_length": 51.0, "completions/max_terminated_length": 51.0, "completions/mean_length": 45.25, "completions/mean_terminated_length": 45.25, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 77.12962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.14063337445259094, "kl": 0.04418475739657879, "learning_rate": 3.242222222222223e-06, "loss": 0.0022, "num_tokens": 1248371.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 4165 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.25, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 21.25, "completions/mean_terminated_length": 21.25, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 77.14814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.033247705549001694, "kl": 0.014412900432944298, "learning_rate": 3.241666666666667e-06, "loss": 0.0007, "num_tokens": 1248708.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4166 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 77.16666666666667, "frac_reward_zero_std": 1.0, "grad_norm": 0.015016169287264347, "kl": 0.0008465871214866638, "learning_rate": 3.2411111111111116e-06, "loss": 0.0, "num_tokens": 1248952.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 4167 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.75, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 19.75, "completions/mean_terminated_length": 19.75, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 77.18518518518519, "frac_reward_zero_std": 1.0, "grad_norm": 0.03395623713731766, "kl": 0.0036518553970381618, "learning_rate": 3.2405555555555555e-06, "loss": 0.0002, "num_tokens": 1249259.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4168 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.75, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 16.75, "completions/mean_terminated_length": 16.75, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 77.20370370370371, "frac_reward_zero_std": 1.0, "grad_norm": 1.1934179067611694, "kl": 0.24260022258386016, "learning_rate": 3.2400000000000003e-06, "loss": 0.0146, "num_tokens": 1249550.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4169 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 17.0, "completions/mean_terminated_length": 17.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 77.22222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.02411009930074215, "kl": 0.16949625313282013, "learning_rate": 3.2394444444444447e-06, "loss": 0.0085, "num_tokens": 1249858.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4170 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 77.24074074074075, "frac_reward_zero_std": 1.0, "grad_norm": 0.02396230772137642, "kl": 0.0022585615515708923, "learning_rate": 3.238888888888889e-06, "loss": 0.0001, "num_tokens": 1250094.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 4171 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 77.25925925925925, "frac_reward_zero_std": 1.0, "grad_norm": 0.02573191560804844, "kl": 0.012185852508991957, "learning_rate": 3.2383333333333334e-06, "loss": 0.0006, "num_tokens": 1250366.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4172 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 77.27777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.012346387840807438, "kl": 0.032072894275188446, "learning_rate": 3.237777777777778e-06, "loss": 0.0016, "num_tokens": 1250582.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 4173 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 77.29629629629629, "frac_reward_zero_std": 1.0, "grad_norm": 0.015076554380357265, "kl": 0.002425803744699806, "learning_rate": 3.2372222222222225e-06, "loss": 0.0001, "num_tokens": 1250870.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4174 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 77.31481481481481, "frac_reward_zero_std": 1.0, "grad_norm": 0.002842926885932684, "kl": 0.00023853182210586965, "learning_rate": 3.236666666666667e-06, "loss": 0.0, "num_tokens": 1251126.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 4175 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.25, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 20.25, "completions/mean_terminated_length": 20.25, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 77.33333333333333, "frac_reward_zero_std": 1.0, "grad_norm": 0.03840457648038864, "kl": 0.012156768701970577, "learning_rate": 3.2361111111111117e-06, "loss": 0.0006, "num_tokens": 1251451.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4176 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 77.35185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.007735866587609053, "kl": 0.00029540061950683594, "learning_rate": 3.2355555555555556e-06, "loss": 0.0, "num_tokens": 1251723.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4177 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 77.37037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.0020575050730258226, "kl": 2.6494264602661133e-05, "learning_rate": 3.2350000000000004e-06, "loss": 0.0, "num_tokens": 1251935.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 4178 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.25, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 14.25, "completions/mean_terminated_length": 14.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 77.38888888888889, "frac_reward_zero_std": 0.0, "grad_norm": 4.0034499168396, "kl": 0.11373278871178627, "learning_rate": 3.2344444444444443e-06, "loss": 0.1463, "num_tokens": 1252216.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 4179 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.5, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 8.5, "completions/mean_terminated_length": 8.5, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 77.4074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.02759678103029728, "kl": 0.0016721024876460433, "learning_rate": 3.233888888888889e-06, "loss": 0.0001, "num_tokens": 1252470.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 4180 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 77.42592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.07799834758043289, "kl": 0.009742952533997595, "learning_rate": 3.2333333333333334e-06, "loss": 0.0005, "num_tokens": 1252770.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4181 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 77.44444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.06416104733943939, "kl": 0.03907478367909789, "learning_rate": 3.2327777777777782e-06, "loss": 0.002, "num_tokens": 1253062.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4182 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 77.46296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.023604515939950943, "kl": 0.0028087663813494146, "learning_rate": 3.2322222222222226e-06, "loss": 0.0001, "num_tokens": 1253340.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4183 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.75, "completions/clipped_ratio": 0.0, "completions/max_length": 54.0, "completions/max_terminated_length": 54.0, "completions/mean_length": 32.75, "completions/mean_terminated_length": 32.75, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 77.48148148148148, "frac_reward_zero_std": 0.0, "grad_norm": 2.833183765411377, "kl": 0.04597795382142067, "learning_rate": 3.231666666666667e-06, "loss": 0.2131, "num_tokens": 1253695.0, "reward": 5.25, "reward_std": 4.5, "rewards/reward_combined/mean": 5.25, "rewards/reward_combined/std": 4.5, "step": 4184 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.0, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 23.0, "completions/mean_terminated_length": 23.0, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 77.5, "frac_reward_zero_std": 1.0, "grad_norm": 0.08092967420816422, "kl": 0.015272689051926136, "learning_rate": 3.2311111111111117e-06, "loss": 0.0008, "num_tokens": 1254015.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4185 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 77.51851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.00016055286687333137, "kl": 2.9318034648895264e-05, "learning_rate": 3.2305555555555556e-06, "loss": 0.0, "num_tokens": 1254235.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 4186 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 12.5, "completions/mean_terminated_length": 12.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 77.53703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.09686514735221863, "kl": 0.01399838412180543, "learning_rate": 3.2300000000000004e-06, "loss": 0.0007, "num_tokens": 1254517.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4187 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 33.75, "completions/clipped_ratio": 0.0, "completions/max_length": 51.0, "completions/max_terminated_length": 51.0, "completions/mean_length": 33.75, "completions/mean_terminated_length": 33.75, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 77.55555555555556, "frac_reward_zero_std": 0.0, "grad_norm": 6.989612579345703, "kl": 1.0940078794956207, "learning_rate": 3.2294444444444443e-06, "loss": 0.131, "num_tokens": 1254868.0, "reward": 5.75, "reward_std": 2.598076105117798, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 2.598076105117798, "step": 4188 }, { "clip_ratio/high_max": 0.008771929889917374, "clip_ratio/high_mean": 0.008771929889917374, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.008771929889917374, "completion_length": 27.0, "completions/clipped_ratio": 0.0, "completions/max_length": 29.0, "completions/max_terminated_length": 29.0, "completions/mean_length": 27.0, "completions/mean_terminated_length": 27.0, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 77.57407407407408, "frac_reward_zero_std": 0.0, "grad_norm": 2.434967041015625, "kl": 0.16425710916519165, "learning_rate": 3.228888888888889e-06, "loss": -0.0625, "num_tokens": 1255200.0, "reward": 7.875, "reward_std": 0.25, "rewards/reward_combined/mean": 7.875, "rewards/reward_combined/std": 0.25, "step": 4189 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.0, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 24.0, "completions/mean_terminated_length": 24.0, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 77.5925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.00862819328904152, "kl": 0.08827322348952293, "learning_rate": 3.228333333333334e-06, "loss": 0.0044, "num_tokens": 1255564.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 4190 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.5, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 6.5, "completions/mean_terminated_length": 6.5, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 77.61111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.21070395410060883, "kl": 0.025110154223511927, "learning_rate": 3.227777777777778e-06, "loss": 0.0017, "num_tokens": 1255790.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 4191 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 27.0, "completions/clipped_ratio": 0.0, "completions/max_length": 36.0, "completions/max_terminated_length": 36.0, "completions/mean_length": 27.0, "completions/mean_terminated_length": 27.0, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 77.62962962962963, "frac_reward_zero_std": 0.0, "grad_norm": 2.4740962982177734, "kl": 0.11245150864124298, "learning_rate": 3.2272222222222226e-06, "loss": 0.1063, "num_tokens": 1256134.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 4192 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.0, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 19.0, "completions/mean_terminated_length": 19.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 77.64814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.6001988649368286, "kl": 0.07265210151672363, "learning_rate": 3.226666666666667e-06, "loss": 0.0036, "num_tokens": 1256430.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4193 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 36.5, "completions/clipped_ratio": 0.0, "completions/max_length": 53.0, "completions/max_terminated_length": 53.0, "completions/mean_length": 36.5, "completions/mean_terminated_length": 36.5, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 77.66666666666667, "frac_reward_zero_std": 1.0, "grad_norm": 0.07023719698190689, "kl": 0.0409962423145771, "learning_rate": 3.2261111111111113e-06, "loss": 0.0015, "num_tokens": 1256796.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4194 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.5, "completions/clipped_ratio": 0.0, "completions/max_length": 34.0, "completions/max_terminated_length": 34.0, "completions/mean_length": 25.5, "completions/mean_terminated_length": 25.5, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 77.68518518518519, "frac_reward_zero_std": 1.0, "grad_norm": 0.1506558656692505, "kl": 0.08068045228719711, "learning_rate": 3.2255555555555557e-06, "loss": 0.0041, "num_tokens": 1257118.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 4195 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 10.5, "completions/mean_terminated_length": 10.5, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 77.70370370370371, "frac_reward_zero_std": 1.0, "grad_norm": 0.14515283703804016, "kl": 0.016825011931359768, "learning_rate": 3.2250000000000005e-06, "loss": 0.0009, "num_tokens": 1257380.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4196 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.25, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 17.25, "completions/mean_terminated_length": 17.25, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 77.72222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.05220238119363785, "kl": 0.005744188674725592, "learning_rate": 3.2244444444444444e-06, "loss": 0.0003, "num_tokens": 1257689.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4197 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 77.74074074074075, "frac_reward_zero_std": 1.0, "grad_norm": 0.10554438829421997, "kl": 0.037209173664450645, "learning_rate": 3.223888888888889e-06, "loss": 0.0019, "num_tokens": 1257977.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4198 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.5, "completions/clipped_ratio": 0.0, "completions/max_length": 41.0, "completions/max_terminated_length": 41.0, "completions/mean_length": 20.5, "completions/mean_terminated_length": 20.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 77.75925925925925, "frac_reward_zero_std": 0.0, "grad_norm": 5.064955234527588, "kl": 0.031611316837370396, "learning_rate": 3.223333333333334e-06, "loss": 0.2362, "num_tokens": 1258279.0, "reward": 5.875, "reward_std": 3.25, "rewards/reward_combined/mean": 5.875, "rewards/reward_combined/std": 3.25, "step": 4199 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 77.77777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.022512994706630707, "kl": 0.01305745029821992, "learning_rate": 3.222777777777778e-06, "loss": 0.0007, "num_tokens": 1258539.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4200 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 77.79629629629629, "frac_reward_zero_std": 1.0, "grad_norm": 0.07443343102931976, "kl": 0.006785556674003601, "learning_rate": 3.2222222222222227e-06, "loss": 0.0003, "num_tokens": 1258799.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 4201 }, { "clip_ratio/high_max": 0.010638297535479069, "clip_ratio/high_mean": 0.010638297535479069, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010638297535479069, "completion_length": 17.5, "completions/clipped_ratio": 0.0, "completions/max_length": 27.0, "completions/max_terminated_length": 27.0, "completions/mean_length": 17.5, "completions/mean_terminated_length": 17.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 77.81481481481481, "frac_reward_zero_std": 0.0, "grad_norm": 5.285989284515381, "kl": 0.08067962154746056, "learning_rate": 3.221666666666667e-06, "loss": -0.0715, "num_tokens": 1259089.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 4202 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 77.83333333333333, "frac_reward_zero_std": 1.0, "grad_norm": 0.011994952335953712, "kl": 0.008880767039954662, "learning_rate": 3.2211111111111114e-06, "loss": 0.0004, "num_tokens": 1259401.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 4203 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.5, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 19.5, "completions/mean_terminated_length": 19.5, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 77.85185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.038224026560783386, "kl": 0.008463687729090452, "learning_rate": 3.2205555555555557e-06, "loss": 0.0004, "num_tokens": 1259727.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 4204 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.25, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 13.25, "completions/mean_terminated_length": 13.25, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 77.87037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.06071503460407257, "kl": 0.004849088145419955, "learning_rate": 3.2200000000000005e-06, "loss": 0.0003, "num_tokens": 1259992.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4205 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 77.88888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.07575024664402008, "kl": 0.002201855182647705, "learning_rate": 3.2194444444444444e-06, "loss": 0.0001, "num_tokens": 1260204.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 4206 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.0, "completions/clipped_ratio": 0.0, "completions/max_length": 27.0, "completions/max_terminated_length": 27.0, "completions/mean_length": 21.0, "completions/mean_terminated_length": 21.0, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 77.9074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.2094966471195221, "kl": 0.06964301690459251, "learning_rate": 3.2188888888888892e-06, "loss": 0.0038, "num_tokens": 1260540.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4207 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 27.75, "completions/clipped_ratio": 0.0, "completions/max_length": 34.0, "completions/max_terminated_length": 34.0, "completions/mean_length": 27.75, "completions/mean_terminated_length": 27.75, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 77.92592592592592, "frac_reward_zero_std": 0.0, "grad_norm": 1.9984288215637207, "kl": 0.06524031981825829, "learning_rate": 3.218333333333334e-06, "loss": -0.0537, "num_tokens": 1260879.0, "reward": 7.625, "reward_std": 0.25, "rewards/reward_combined/mean": 7.625, "rewards/reward_combined/std": 0.25, "step": 4208 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 77.94444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.0878801941871643, "kl": 0.01840830221772194, "learning_rate": 3.217777777777778e-06, "loss": 0.0009, "num_tokens": 1261143.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4209 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.5, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 3.5, "completions/mean_terminated_length": 3.5, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 77.96296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.07105470448732376, "kl": 0.005891479551792145, "learning_rate": 3.2172222222222227e-06, "loss": 0.0003, "num_tokens": 1261353.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 4210 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.5, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.5, "completions/mean_terminated_length": 6.5, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 77.98148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.050667524337768555, "kl": 0.004347767157014459, "learning_rate": 3.2166666666666666e-06, "loss": 0.0002, "num_tokens": 1261587.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 4211 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.75, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.75, "completions/mean_terminated_length": 15.75, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 78.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.01956312358379364, "kl": 0.0021940982551313937, "learning_rate": 3.2161111111111114e-06, "loss": 0.0001, "num_tokens": 1261910.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4212 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.5, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 19.5, "completions/mean_terminated_length": 19.5, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 78.01851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.36251887679100037, "kl": 0.05162295885384083, "learning_rate": 3.2155555555555558e-06, "loss": 0.0026, "num_tokens": 1262208.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4213 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 78.03703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.07463781535625458, "kl": 0.020119650289416313, "learning_rate": 3.215e-06, "loss": 0.001, "num_tokens": 1262494.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4214 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 78.05555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.010396135970950127, "kl": 0.00026292799884686247, "learning_rate": 3.2144444444444445e-06, "loss": 0.0, "num_tokens": 1262750.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 4215 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 78.07407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.012214413844048977, "kl": 0.0003219917416572571, "learning_rate": 3.2138888888888893e-06, "loss": 0.0, "num_tokens": 1262962.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 4216 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 78.0925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.026252923533320427, "kl": 0.005202127853408456, "learning_rate": 3.213333333333334e-06, "loss": 0.0002, "num_tokens": 1263254.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4217 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.5, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 17.5, "completions/mean_terminated_length": 17.5, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 78.11111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.23048518598079681, "kl": 0.023523319512605667, "learning_rate": 3.212777777777778e-06, "loss": 0.0012, "num_tokens": 1263576.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4218 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.25, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 17.25, "completions/mean_terminated_length": 17.25, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 78.12962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.20207923650741577, "kl": 0.033947313437238336, "learning_rate": 3.2122222222222228e-06, "loss": 0.0016, "num_tokens": 1263885.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4219 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.25, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 8.25, "completions/mean_terminated_length": 8.25, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 78.14814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.06866934895515442, "kl": 0.023768195882439613, "learning_rate": 3.2116666666666667e-06, "loss": 0.0012, "num_tokens": 1264130.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 4220 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.0, "completions/clipped_ratio": 0.0, "completions/max_length": 35.0, "completions/max_terminated_length": 35.0, "completions/mean_length": 23.0, "completions/mean_terminated_length": 23.0, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 78.16666666666667, "frac_reward_zero_std": 1.0, "grad_norm": 4.295715808868408, "kl": 0.2798140188679099, "learning_rate": 3.2111111111111115e-06, "loss": 0.0175, "num_tokens": 1264474.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4221 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 78.18518518518519, "frac_reward_zero_std": 1.0, "grad_norm": 0.08317422866821289, "kl": 0.025180064141750336, "learning_rate": 3.210555555555556e-06, "loss": 0.0013, "num_tokens": 1264742.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4222 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 78.20370370370371, "frac_reward_zero_std": 1.0, "grad_norm": 0.07469506561756134, "kl": 0.01776110101491213, "learning_rate": 3.21e-06, "loss": 0.0009, "num_tokens": 1265026.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4223 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 26.25, "completions/clipped_ratio": 0.0, "completions/max_length": 33.0, "completions/max_terminated_length": 33.0, "completions/mean_length": 26.25, "completions/mean_terminated_length": 26.25, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 78.22222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.23790664970874786, "kl": 0.12087386846542358, "learning_rate": 3.2094444444444445e-06, "loss": 0.0064, "num_tokens": 1265367.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4224 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 78.24074074074075, "frac_reward_zero_std": 1.0, "grad_norm": 0.03366198390722275, "kl": 0.007112078135833144, "learning_rate": 3.2088888888888893e-06, "loss": 0.0004, "num_tokens": 1265665.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4225 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 78.25925925925925, "frac_reward_zero_std": 1.0, "grad_norm": 0.02028532139956951, "kl": 0.0028079254552721977, "learning_rate": 3.2083333333333337e-06, "loss": 0.0001, "num_tokens": 1265941.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4226 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 78.27777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.002827334450557828, "kl": 4.531443119049072e-05, "learning_rate": 3.207777777777778e-06, "loss": 0.0, "num_tokens": 1266153.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 4227 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.5, "completions/clipped_ratio": 0.0, "completions/max_length": 28.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 22.5, "completions/mean_terminated_length": 22.5, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 78.29629629629629, "frac_reward_zero_std": 0.0, "grad_norm": 3.4627578258514404, "kl": 0.0203080247156322, "learning_rate": 3.207222222222223e-06, "loss": 0.0612, "num_tokens": 1266487.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 4228 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.75, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.75, "completions/mean_terminated_length": 14.75, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 78.31481481481481, "frac_reward_zero_std": 1.0, "grad_norm": 0.05497918277978897, "kl": 0.005232128663919866, "learning_rate": 3.2066666666666667e-06, "loss": 0.0003, "num_tokens": 1266778.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4229 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 33.0, "completions/clipped_ratio": 0.0, "completions/max_length": 37.0, "completions/max_terminated_length": 37.0, "completions/mean_length": 33.0, "completions/mean_terminated_length": 33.0, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 78.33333333333333, "frac_reward_zero_std": 0.0, "grad_norm": 1.6072700023651123, "kl": 0.14917077869176865, "learning_rate": 3.2061111111111115e-06, "loss": -0.0424, "num_tokens": 1267138.0, "reward": 7.875, "reward_std": 0.25, "rewards/reward_combined/mean": 7.875, "rewards/reward_combined/std": 0.25, "step": 4230 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.25, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 14.25, "completions/mean_terminated_length": 14.25, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 78.35185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.10464707016944885, "kl": 0.022186639718711376, "learning_rate": 3.2055555555555555e-06, "loss": 0.0011, "num_tokens": 1267411.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4231 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 30.0, "completions/clipped_ratio": 0.0, "completions/max_length": 33.0, "completions/max_terminated_length": 33.0, "completions/mean_length": 30.0, "completions/mean_terminated_length": 30.0, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 78.37037037037037, "frac_reward_zero_std": 0.0, "grad_norm": 5.0722856521606445, "kl": 0.29547087848186493, "learning_rate": 3.2050000000000002e-06, "loss": 0.0376, "num_tokens": 1267747.0, "reward": 5.75, "reward_std": 4.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 4.5, "step": 4232 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.25, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.25, "completions/mean_terminated_length": 14.25, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 78.38888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.025883039459586143, "kl": 0.0024399542016908526, "learning_rate": 3.2044444444444446e-06, "loss": 0.0001, "num_tokens": 1268064.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4233 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.5, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 22.5, "completions/mean_terminated_length": 22.5, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 78.4074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.02633281797170639, "kl": 0.008916971739381552, "learning_rate": 3.2038888888888894e-06, "loss": 0.0004, "num_tokens": 1268382.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4234 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 78.42592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.0069231013767421246, "kl": 0.0040001049637794495, "learning_rate": 3.2033333333333337e-06, "loss": 0.0002, "num_tokens": 1268642.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 4235 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.5, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 16.5, "completions/mean_terminated_length": 16.5, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 78.44444444444444, "frac_reward_zero_std": 0.0, "grad_norm": 2.576848030090332, "kl": 0.03215278685092926, "learning_rate": 3.202777777777778e-06, "loss": 0.028, "num_tokens": 1268956.0, "reward": 2.75, "reward_std": 1.5, "rewards/reward_combined/mean": 2.75, "rewards/reward_combined/std": 1.5, "step": 4236 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 28.25, "completions/clipped_ratio": 0.0, "completions/max_length": 34.0, "completions/max_terminated_length": 34.0, "completions/mean_length": 28.25, "completions/mean_terminated_length": 28.25, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 78.46296296296296, "frac_reward_zero_std": 0.0, "grad_norm": 1.9649748802185059, "kl": 0.13642173260450363, "learning_rate": 3.202222222222223e-06, "loss": -0.0493, "num_tokens": 1269293.0, "reward": 7.875, "reward_std": 0.25, "rewards/reward_combined/mean": 7.875, "rewards/reward_combined/std": 0.25, "step": 4237 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 78.48148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.004975532181560993, "kl": 0.21363212913274765, "learning_rate": 3.201666666666667e-06, "loss": 0.0107, "num_tokens": 1269597.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4238 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 78.5, "frac_reward_zero_std": 1.0, "grad_norm": 0.0070450184866786, "kl": 0.002711727051064372, "learning_rate": 3.2011111111111116e-06, "loss": 0.0001, "num_tokens": 1269881.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4239 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 78.51851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.04775227606296539, "kl": 0.012123403139412403, "learning_rate": 3.2005555555555555e-06, "loss": 0.0006, "num_tokens": 1270153.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4240 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 28.0, "completions/clipped_ratio": 0.0, "completions/max_length": 32.0, "completions/max_terminated_length": 32.0, "completions/mean_length": 28.0, "completions/mean_terminated_length": 28.0, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 78.53703703703704, "frac_reward_zero_std": 0.0, "grad_norm": 4.010278701782227, "kl": 0.12488511763513088, "learning_rate": 3.2000000000000003e-06, "loss": 0.038, "num_tokens": 1270513.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 4241 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 25.0, "completions/max_terminated_length": 25.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 78.55555555555556, "frac_reward_zero_std": 0.0, "grad_norm": 2.565549373626709, "kl": 0.021727357991039753, "learning_rate": 3.1994444444444446e-06, "loss": -0.1541, "num_tokens": 1270797.0, "reward": 7.625, "reward_std": 0.25, "rewards/reward_combined/mean": 7.625, "rewards/reward_combined/std": 0.25, "step": 4242 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 78.57407407407408, "frac_reward_zero_std": 0.0, "grad_norm": 1.8418606519699097, "kl": 0.02765865297988057, "learning_rate": 3.198888888888889e-06, "loss": -0.0008, "num_tokens": 1271101.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 4243 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 78.5925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.036448102444410324, "kl": 0.03312306106090546, "learning_rate": 3.1983333333333338e-06, "loss": 0.0017, "num_tokens": 1271317.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 4244 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 78.61111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.00015668936248403043, "kl": 3.3549964427948e-05, "learning_rate": 3.197777777777778e-06, "loss": 0.0, "num_tokens": 1271537.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 4245 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 78.62962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.03827185556292534, "kl": 0.008351562079042196, "learning_rate": 3.1972222222222225e-06, "loss": 0.0005, "num_tokens": 1271805.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4246 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 78.64814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.019718347117304802, "kl": 0.01378082251176238, "learning_rate": 3.196666666666667e-06, "loss": 0.0007, "num_tokens": 1272065.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4247 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 78.66666666666667, "frac_reward_zero_std": 1.0, "grad_norm": 0.03133159875869751, "kl": 0.0024018955882638693, "learning_rate": 3.1961111111111116e-06, "loss": 0.0001, "num_tokens": 1272353.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4248 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 78.68518518518519, "frac_reward_zero_std": 0.0, "grad_norm": 4.759934902191162, "kl": 0.038768935948610306, "learning_rate": 3.1955555555555555e-06, "loss": -0.0037, "num_tokens": 1272613.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 4249 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.5, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 8.5, "completions/mean_terminated_length": 8.5, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 78.70370370370371, "frac_reward_zero_std": 1.0, "grad_norm": 0.02111721597611904, "kl": 0.0017049953457899392, "learning_rate": 3.1950000000000003e-06, "loss": 0.0001, "num_tokens": 1272867.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 4250 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.5, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 4.5, "completions/mean_terminated_length": 4.5, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 78.72222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.11189942806959152, "kl": 0.005791515111923218, "learning_rate": 3.1944444444444443e-06, "loss": 0.0003, "num_tokens": 1273085.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 4251 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.5, "completions/clipped_ratio": 0.0, "completions/max_length": 31.0, "completions/max_terminated_length": 31.0, "completions/mean_length": 25.5, "completions/mean_terminated_length": 25.5, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 78.74074074074075, "frac_reward_zero_std": 1.0, "grad_norm": 0.07803048193454742, "kl": 0.029772132635116577, "learning_rate": 3.193888888888889e-06, "loss": 0.0014, "num_tokens": 1273411.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4252 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.25, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 3.25, "completions/mean_terminated_length": 3.25, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 78.75925925925925, "frac_reward_zero_std": 1.0, "grad_norm": 0.4409222900867462, "kl": 0.01879177545197308, "learning_rate": 3.193333333333334e-06, "loss": 0.0012, "num_tokens": 1273620.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 4253 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 78.77777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.0060469480231404305, "kl": 0.00034608939313329756, "learning_rate": 3.192777777777778e-06, "loss": 0.0, "num_tokens": 1273892.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4254 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 41.25, "completions/clipped_ratio": 0.0, "completions/max_length": 45.0, "completions/max_terminated_length": 45.0, "completions/mean_length": 41.25, "completions/mean_terminated_length": 41.25, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 78.79629629629629, "frac_reward_zero_std": 0.0, "grad_norm": 1.7196505069732666, "kl": 0.15110120177268982, "learning_rate": 3.1922222222222225e-06, "loss": -0.0647, "num_tokens": 1274337.0, "reward": 2.375, "reward_std": 1.25, "rewards/reward_combined/mean": 2.375, "rewards/reward_combined/std": 1.25, "step": 4255 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 29.5, "completions/clipped_ratio": 0.0, "completions/max_length": 46.0, "completions/max_terminated_length": 46.0, "completions/mean_length": 29.5, "completions/mean_terminated_length": 29.5, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 78.81481481481481, "frac_reward_zero_std": 0.0, "grad_norm": 3.063941240310669, "kl": 0.07675593718886375, "learning_rate": 3.191666666666667e-06, "loss": 0.1608, "num_tokens": 1274723.0, "reward": 3.375, "reward_std": 0.25, "rewards/reward_combined/mean": 3.375, "rewards/reward_combined/std": 0.25, "step": 4256 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.25, "completions/clipped_ratio": 0.0, "completions/max_length": 49.0, "completions/max_terminated_length": 49.0, "completions/mean_length": 15.25, "completions/mean_terminated_length": 15.25, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 78.83333333333333, "frac_reward_zero_std": 0.0, "grad_norm": 4.644118309020996, "kl": 0.015743468888103962, "learning_rate": 3.1911111111111117e-06, "loss": 0.4243, "num_tokens": 1275004.0, "reward": 3.549999952316284, "reward_std": 0.8999999761581421, "rewards/reward_combined/mean": 3.549999952316284, "rewards/reward_combined/std": 0.8999999761581421, "step": 4257 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 86.75, "completions/clipped_ratio": 0.25, "completions/max_length": 256.0, "completions/max_terminated_length": 51.0, "completions/mean_length": 86.75, "completions/mean_terminated_length": 30.33333396911621, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 78.85185185185185, "frac_reward_zero_std": 0.0, "grad_norm": 1.083248496055603, "kl": 0.06285767629742622, "learning_rate": 3.1905555555555556e-06, "loss": 0.3433, "num_tokens": 1275571.0, "reward": 2.875, "reward_std": 5.647639751434326, "rewards/reward_combined/mean": 2.875, "rewards/reward_combined/std": 5.647639751434326, "step": 4258 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.25, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.25, "completions/mean_terminated_length": 6.25, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 78.87037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.09714563190937042, "kl": 0.009757119230926037, "learning_rate": 3.1900000000000004e-06, "loss": 0.0005, "num_tokens": 1275804.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 4259 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 78.88888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.07023876905441284, "kl": 0.004095462849363685, "learning_rate": 3.1894444444444443e-06, "loss": 0.0002, "num_tokens": 1276060.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4260 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 78.9074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.001085838652215898, "kl": 0.0016783199389465153, "learning_rate": 3.188888888888889e-06, "loss": 0.0001, "num_tokens": 1276340.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4261 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.75, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 18.75, "completions/mean_terminated_length": 18.75, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 78.92592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.2531422972679138, "kl": 0.08417549729347229, "learning_rate": 3.188333333333334e-06, "loss": 0.0043, "num_tokens": 1276635.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 4262 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.75, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 19.75, "completions/mean_terminated_length": 19.75, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 78.94444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.10724072903394699, "kl": 0.09566434472799301, "learning_rate": 3.187777777777778e-06, "loss": 0.0048, "num_tokens": 1276942.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4263 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 17.0, "completions/mean_terminated_length": 17.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 78.96296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.0240892693400383, "kl": 0.16925374418497086, "learning_rate": 3.1872222222222226e-06, "loss": 0.0085, "num_tokens": 1277250.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4264 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.75, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 12.75, "completions/mean_terminated_length": 12.75, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 78.98148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.04262186959385872, "kl": 0.002718259405810386, "learning_rate": 3.186666666666667e-06, "loss": 0.0001, "num_tokens": 1277513.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4265 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.25, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 21.25, "completions/mean_terminated_length": 21.25, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 79.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.018527869135141373, "kl": 0.017068447079509497, "learning_rate": 3.1861111111111113e-06, "loss": 0.0009, "num_tokens": 1277846.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4266 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 79.01851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.01218686904758215, "kl": 0.001913921907544136, "learning_rate": 3.1855555555555556e-06, "loss": 0.0001, "num_tokens": 1278128.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4267 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.25, "completions/clipped_ratio": 0.0, "completions/max_length": 41.0, "completions/max_terminated_length": 41.0, "completions/mean_length": 24.25, "completions/mean_terminated_length": 24.25, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 79.03703703703704, "frac_reward_zero_std": 0.0, "grad_norm": 3.9550271034240723, "kl": 0.13615743815898895, "learning_rate": 3.1850000000000004e-06, "loss": 0.1367, "num_tokens": 1278445.0, "reward": 6.0, "reward_std": 3.34165620803833, "rewards/reward_combined/mean": 6.0, "rewards/reward_combined/std": 3.34165620803833, "step": 4268 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.0, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 24.0, "completions/mean_terminated_length": 24.0, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 79.05555555555556, "frac_reward_zero_std": 0.0, "grad_norm": 0.6177045702934265, "kl": 0.19624088332057, "learning_rate": 3.1844444444444444e-06, "loss": 0.0098, "num_tokens": 1278809.0, "reward": 2.875, "reward_std": 1.25, "rewards/reward_combined/mean": 2.875, "rewards/reward_combined/std": 1.25, "step": 4269 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 28.25, "completions/clipped_ratio": 0.0, "completions/max_length": 30.0, "completions/max_terminated_length": 30.0, "completions/mean_length": 28.25, "completions/mean_terminated_length": 28.25, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 79.07407407407408, "frac_reward_zero_std": 0.0, "grad_norm": 2.4848055839538574, "kl": 0.12044533714652061, "learning_rate": 3.183888888888889e-06, "loss": -0.0214, "num_tokens": 1279138.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 4270 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.25, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.25, "completions/mean_terminated_length": 11.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 79.0925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.05077459290623665, "kl": 0.00983051024377346, "learning_rate": 3.183333333333334e-06, "loss": 0.0005, "num_tokens": 1279411.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4271 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.25, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.25, "completions/mean_terminated_length": 15.25, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 79.11111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.015019702725112438, "kl": 0.0017954861978068948, "learning_rate": 3.182777777777778e-06, "loss": 0.0001, "num_tokens": 1279732.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4272 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 79.12962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.022312946617603302, "kl": 0.000650748610496521, "learning_rate": 3.1822222222222226e-06, "loss": 0.0, "num_tokens": 1279944.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 4273 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.75, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 13.75, "completions/mean_terminated_length": 13.75, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 79.14814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.04600575566291809, "kl": 0.010072236647829413, "learning_rate": 3.181666666666667e-06, "loss": 0.0005, "num_tokens": 1280231.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4274 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 79.16666666666667, "frac_reward_zero_std": 1.0, "grad_norm": 0.0063865515403449535, "kl": 0.00047791004180908203, "learning_rate": 3.1811111111111113e-06, "loss": 0.0, "num_tokens": 1280475.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 4275 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.75, "completions/clipped_ratio": 0.0, "completions/max_length": 31.0, "completions/max_terminated_length": 31.0, "completions/mean_length": 24.75, "completions/mean_terminated_length": 24.75, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 79.18518518518519, "frac_reward_zero_std": 1.0, "grad_norm": 0.09307347238063812, "kl": 0.023490027524530888, "learning_rate": 3.1805555555555557e-06, "loss": 0.0013, "num_tokens": 1280822.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 4276 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.75, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 17.75, "completions/mean_terminated_length": 17.75, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 79.20370370370371, "frac_reward_zero_std": 1.0, "grad_norm": 0.08195064216852188, "kl": 0.16778303682804108, "learning_rate": 3.1800000000000005e-06, "loss": 0.0084, "num_tokens": 1281133.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4277 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.75, "completions/clipped_ratio": 0.0, "completions/max_length": 47.0, "completions/max_terminated_length": 47.0, "completions/mean_length": 20.75, "completions/mean_terminated_length": 20.75, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 79.22222222222223, "frac_reward_zero_std": 0.0, "grad_norm": 4.966688632965088, "kl": 0.050434730015695095, "learning_rate": 3.1794444444444444e-06, "loss": 0.2795, "num_tokens": 1281440.0, "reward": 5.25, "reward_std": 4.5, "rewards/reward_combined/mean": 5.25, "rewards/reward_combined/std": 4.5, "step": 4278 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 7.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 7.0, "completions/mean_terminated_length": 7.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 79.24074074074075, "frac_reward_zero_std": 0.0, "grad_norm": 13.213356018066406, "kl": 0.01460088649764657, "learning_rate": 3.178888888888889e-06, "loss": 0.6264, "num_tokens": 1281672.0, "reward": 2.0, "reward_std": 1.7320507764816284, "rewards/reward_combined/mean": 2.0, "rewards/reward_combined/std": 1.7320507764816284, "step": 4279 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 28.0, "completions/clipped_ratio": 0.0, "completions/max_length": 28.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 28.0, "completions/mean_terminated_length": 28.0, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 79.25925925925925, "frac_reward_zero_std": 0.0, "grad_norm": 1.8359723091125488, "kl": 0.0743766725063324, "learning_rate": 3.178333333333334e-06, "loss": 0.0024, "num_tokens": 1282020.0, "reward": 4.5, "reward_std": 2.0, "rewards/reward_combined/mean": 4.5, "rewards/reward_combined/std": 2.0, "step": 4280 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.5, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 22.5, "completions/mean_terminated_length": 22.5, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 79.27777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.08965244144201279, "kl": 0.020674108061939478, "learning_rate": 3.177777777777778e-06, "loss": 0.0011, "num_tokens": 1282338.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4281 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.5, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 19.5, "completions/mean_terminated_length": 19.5, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 79.29629629629629, "frac_reward_zero_std": 1.0, "grad_norm": 0.3187606930732727, "kl": 0.05726682394742966, "learning_rate": 3.1772222222222227e-06, "loss": 0.0028, "num_tokens": 1282660.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4282 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.0, "completions/clipped_ratio": 0.0, "completions/max_length": 42.0, "completions/max_terminated_length": 42.0, "completions/mean_length": 21.0, "completions/mean_terminated_length": 21.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 79.31481481481481, "frac_reward_zero_std": 0.0, "grad_norm": 4.357220649719238, "kl": 0.0137846649158746, "learning_rate": 3.1766666666666666e-06, "loss": 0.2526, "num_tokens": 1282956.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 4283 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 79.33333333333333, "frac_reward_zero_std": 1.0, "grad_norm": 0.550933301448822, "kl": 0.11317499727010727, "learning_rate": 3.1761111111111114e-06, "loss": 0.0057, "num_tokens": 1283216.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4284 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.75, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 18.75, "completions/mean_terminated_length": 18.75, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 79.35185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.12822124361991882, "kl": 0.04254670534282923, "learning_rate": 3.1755555555555557e-06, "loss": 0.0022, "num_tokens": 1283511.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4285 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 29.0, "completions/clipped_ratio": 0.0, "completions/max_length": 33.0, "completions/max_terminated_length": 33.0, "completions/mean_length": 29.0, "completions/mean_terminated_length": 29.0, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 79.37037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.06414917856454849, "kl": 0.027960723266005516, "learning_rate": 3.175e-06, "loss": 0.0014, "num_tokens": 1283851.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4286 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 79.38888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.0014850427396595478, "kl": 0.0034432844258844852, "learning_rate": 3.1744444444444445e-06, "loss": 0.0002, "num_tokens": 1284147.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4287 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 79.4074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.000156598340254277, "kl": 3.939121961593628e-05, "learning_rate": 3.1738888888888892e-06, "loss": 0.0, "num_tokens": 1284367.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 4288 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 79.42592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.003109095385298133, "kl": 0.0005995035462547094, "learning_rate": 3.173333333333334e-06, "loss": 0.0, "num_tokens": 1284627.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 4289 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 79.44444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.014179665595293045, "kl": 0.005053227301687002, "learning_rate": 3.172777777777778e-06, "loss": 0.0003, "num_tokens": 1284891.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4290 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 79.46296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.01407910231500864, "kl": 0.008100613951683044, "learning_rate": 3.1722222222222227e-06, "loss": 0.0004, "num_tokens": 1285127.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 4291 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 5.5, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 5.5, "completions/mean_terminated_length": 5.5, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 79.48148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.020880088210105896, "kl": 0.0010753061069408432, "learning_rate": 3.1716666666666667e-06, "loss": 0.0001, "num_tokens": 1285349.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 4292 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 34.75, "completions/clipped_ratio": 0.0, "completions/max_length": 38.0, "completions/max_terminated_length": 38.0, "completions/mean_length": 34.75, "completions/mean_terminated_length": 34.75, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 79.5, "frac_reward_zero_std": 1.0, "grad_norm": 0.05930650979280472, "kl": 0.07538562268018723, "learning_rate": 3.1711111111111114e-06, "loss": 0.0038, "num_tokens": 1285716.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4293 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 79.51851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.09157321602106094, "kl": 0.005927935242652893, "learning_rate": 3.1705555555555558e-06, "loss": 0.0003, "num_tokens": 1285924.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 4294 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 30.0, "completions/clipped_ratio": 0.0, "completions/max_length": 38.0, "completions/max_terminated_length": 38.0, "completions/mean_length": 30.0, "completions/mean_terminated_length": 30.0, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 79.53703703703704, "frac_reward_zero_std": 0.0, "grad_norm": 3.6294119358062744, "kl": 0.1382306143641472, "learning_rate": 3.17e-06, "loss": 0.0517, "num_tokens": 1286268.0, "reward": 6.125, "reward_std": 3.4247870445251465, "rewards/reward_combined/mean": 6.125, "rewards/reward_combined/std": 3.4247870445251465, "step": 4295 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 79.55555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.01985904946923256, "kl": 0.22705276310443878, "learning_rate": 3.1694444444444445e-06, "loss": 0.0113, "num_tokens": 1286570.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4296 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 79.57407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.017894649878144264, "kl": 0.014367636758834124, "learning_rate": 3.1688888888888893e-06, "loss": 0.0007, "num_tokens": 1286830.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4297 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.75, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.75, "completions/mean_terminated_length": 14.75, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 79.5925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.022209711372852325, "kl": 0.034267500042915344, "learning_rate": 3.1683333333333336e-06, "loss": 0.0018, "num_tokens": 1287121.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4298 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 79.61111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.08420666307210922, "kl": 0.03535403311252594, "learning_rate": 3.167777777777778e-06, "loss": 0.0019, "num_tokens": 1287393.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4299 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 79.62962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.0177326463162899, "kl": 0.012644006870687008, "learning_rate": 3.1672222222222228e-06, "loss": 0.0006, "num_tokens": 1287705.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 4300 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.75, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 15.75, "completions/mean_terminated_length": 15.75, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 79.64814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.6605138182640076, "kl": 0.065140126273036, "learning_rate": 3.1666666666666667e-06, "loss": 0.0035, "num_tokens": 1287992.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4301 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.25, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 19.25, "completions/mean_terminated_length": 19.25, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 79.66666666666667, "frac_reward_zero_std": 1.0, "grad_norm": 0.08781445771455765, "kl": 0.04065949935466051, "learning_rate": 3.1661111111111115e-06, "loss": 0.002, "num_tokens": 1288297.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4302 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 79.68518518518519, "frac_reward_zero_std": 1.0, "grad_norm": 0.025546075776219368, "kl": 0.00506246299482882, "learning_rate": 3.1655555555555554e-06, "loss": 0.0003, "num_tokens": 1288585.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4303 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 79.70370370370371, "frac_reward_zero_std": 1.0, "grad_norm": 0.0031281912233680487, "kl": 0.002861990360543132, "learning_rate": 3.165e-06, "loss": 0.0001, "num_tokens": 1288869.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4304 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.25, "completions/clipped_ratio": 0.0, "completions/max_length": 28.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 21.25, "completions/mean_terminated_length": 21.25, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 79.72222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.08879492431879044, "kl": 0.03214646503329277, "learning_rate": 3.1644444444444445e-06, "loss": 0.0016, "num_tokens": 1289206.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4305 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 44.0, "completions/clipped_ratio": 0.0, "completions/max_length": 45.0, "completions/max_terminated_length": 45.0, "completions/mean_length": 44.0, "completions/mean_terminated_length": 44.0, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 79.74074074074075, "frac_reward_zero_std": 1.0, "grad_norm": 0.02567167580127716, "kl": 0.04449847154319286, "learning_rate": 3.1638888888888893e-06, "loss": 0.0022, "num_tokens": 1289662.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 4306 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 79.75925925925925, "frac_reward_zero_std": 1.0, "grad_norm": 0.07063457369804382, "kl": 0.0027805149074993096, "learning_rate": 3.1633333333333337e-06, "loss": 0.0001, "num_tokens": 1289918.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 4307 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.5, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 20.5, "completions/mean_terminated_length": 20.5, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 79.77777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.050275545567274094, "kl": 0.017322886735200882, "learning_rate": 3.162777777777778e-06, "loss": 0.0009, "num_tokens": 1290252.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4308 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.0, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 19.0, "completions/mean_terminated_length": 19.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 79.79629629629629, "frac_reward_zero_std": 1.0, "grad_norm": 0.01905231736600399, "kl": 0.0011065587168559432, "learning_rate": 3.162222222222223e-06, "loss": 0.0001, "num_tokens": 1290568.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4309 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.25, "completions/clipped_ratio": 0.0, "completions/max_length": 25.0, "completions/max_terminated_length": 25.0, "completions/mean_length": 22.25, "completions/mean_terminated_length": 22.25, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 79.81481481481481, "frac_reward_zero_std": 0.0, "grad_norm": 2.4008748531341553, "kl": 0.08028214052319527, "learning_rate": 3.1616666666666667e-06, "loss": 0.0244, "num_tokens": 1290885.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 4310 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 79.83333333333333, "frac_reward_zero_std": 1.0, "grad_norm": 0.03192393481731415, "kl": 0.0027828688616864383, "learning_rate": 3.1611111111111115e-06, "loss": 0.0001, "num_tokens": 1291141.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4311 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.0, "completions/clipped_ratio": 0.0, "completions/max_length": 35.0, "completions/max_terminated_length": 35.0, "completions/mean_length": 23.0, "completions/mean_terminated_length": 23.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 79.85185185185185, "frac_reward_zero_std": 0.0, "grad_norm": 3.454204559326172, "kl": 0.12561389058828354, "learning_rate": 3.1605555555555555e-06, "loss": 0.1641, "num_tokens": 1291453.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 4312 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.75, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.75, "completions/mean_terminated_length": 6.75, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 79.87037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.005525216925889254, "kl": 0.00027983245672658086, "learning_rate": 3.1600000000000002e-06, "loss": 0.0, "num_tokens": 1291688.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 4313 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 79.88888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.010643990710377693, "kl": 0.031518734991550446, "learning_rate": 3.159444444444445e-06, "loss": 0.0016, "num_tokens": 1291904.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 4314 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.5, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 18.5, "completions/mean_terminated_length": 18.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 79.9074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.36021268367767334, "kl": 0.057238608598709106, "learning_rate": 3.158888888888889e-06, "loss": 0.0032, "num_tokens": 1292194.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4315 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 34.25, "completions/clipped_ratio": 0.0, "completions/max_length": 48.0, "completions/max_terminated_length": 48.0, "completions/mean_length": 34.25, "completions/mean_terminated_length": 34.25, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 79.92592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.03324849531054497, "kl": 0.04484066367149353, "learning_rate": 3.1583333333333337e-06, "loss": 0.0017, "num_tokens": 1292551.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4316 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.0, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 20.0, "completions/mean_terminated_length": 20.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 79.94444444444444, "frac_reward_zero_std": 0.0, "grad_norm": 13.284072875976562, "kl": 0.03742528101429343, "learning_rate": 3.157777777777778e-06, "loss": 0.0528, "num_tokens": 1292879.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 4317 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 79.96296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.03020230121910572, "kl": 0.0027833332715090364, "learning_rate": 3.1572222222222224e-06, "loss": 0.0001, "num_tokens": 1293151.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4318 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 79.98148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.02535565197467804, "kl": 0.004383474588394165, "learning_rate": 3.156666666666667e-06, "loss": 0.0002, "num_tokens": 1293411.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 4319 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 80.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.017647024244070053, "kl": 0.0043428558856248856, "learning_rate": 3.1561111111111116e-06, "loss": 0.0002, "num_tokens": 1293715.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4320 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 45.5, "completions/clipped_ratio": 0.0, "completions/max_length": 51.0, "completions/max_terminated_length": 51.0, "completions/mean_length": 45.5, "completions/mean_terminated_length": 45.5, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 80.01851851851852, "frac_reward_zero_std": 0.0, "grad_norm": 0.8233867883682251, "kl": 0.05103733763098717, "learning_rate": 3.1555555555555555e-06, "loss": -0.0413, "num_tokens": 1294177.0, "reward": 2.799999952316284, "reward_std": 0.4000000059604645, "rewards/reward_combined/mean": 2.799999952316284, "rewards/reward_combined/std": 0.4000000059604645, "step": 4321 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 80.03703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.00015613944560755044, "kl": 3.950297832489014e-05, "learning_rate": 3.1550000000000003e-06, "loss": 0.0, "num_tokens": 1294397.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 4322 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.75, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 15.75, "completions/mean_terminated_length": 15.75, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 80.05555555555556, "frac_reward_zero_std": 0.0, "grad_norm": 6.113419055938721, "kl": 0.010589133482426405, "learning_rate": 3.154444444444445e-06, "loss": 0.1583, "num_tokens": 1294672.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 4323 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.5, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 16.5, "completions/mean_terminated_length": 16.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 80.07407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.023707250133156776, "kl": 0.004169234540313482, "learning_rate": 3.153888888888889e-06, "loss": 0.0002, "num_tokens": 1294966.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4324 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.0, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 22.0, "completions/mean_terminated_length": 22.0, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 80.0925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.015043220482766628, "kl": 0.01029182830825448, "learning_rate": 3.1533333333333338e-06, "loss": 0.0005, "num_tokens": 1295282.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4325 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.75, "completions/clipped_ratio": 0.0, "completions/max_length": 34.0, "completions/max_terminated_length": 34.0, "completions/mean_length": 16.75, "completions/mean_terminated_length": 16.75, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 80.11111111111111, "frac_reward_zero_std": 0.0, "grad_norm": 2.7025742530822754, "kl": 0.00400135014206171, "learning_rate": 3.152777777777778e-06, "loss": 0.2596, "num_tokens": 1295601.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 4326 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 80.12962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.03832979500293732, "kl": 0.0012608403631020337, "learning_rate": 3.1522222222222225e-06, "loss": 0.0001, "num_tokens": 1295869.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4327 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.25, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.25, "completions/mean_terminated_length": 14.25, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 80.14814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.03631162270903587, "kl": 0.02020381996408105, "learning_rate": 3.151666666666667e-06, "loss": 0.001, "num_tokens": 1296158.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4328 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 80.16666666666667, "frac_reward_zero_std": 1.0, "grad_norm": 0.003926755394786596, "kl": 0.000180485840246547, "learning_rate": 3.1511111111111116e-06, "loss": 0.0, "num_tokens": 1296414.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 4329 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 80.18518518518519, "frac_reward_zero_std": 1.0, "grad_norm": 0.025338731706142426, "kl": 0.0027392454212531447, "learning_rate": 3.1505555555555556e-06, "loss": 0.0001, "num_tokens": 1296716.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4330 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 80.20370370370371, "frac_reward_zero_std": 1.0, "grad_norm": 0.015181378461420536, "kl": 0.007825560867786407, "learning_rate": 3.1500000000000003e-06, "loss": 0.0004, "num_tokens": 1296952.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 4331 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 80.22222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.00734187476336956, "kl": 0.0011956244707107544, "learning_rate": 3.149444444444445e-06, "loss": 0.0001, "num_tokens": 1297164.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 4332 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 80.24074074074075, "frac_reward_zero_std": 1.0, "grad_norm": 0.0033876688685268164, "kl": 0.0005980640416964889, "learning_rate": 3.148888888888889e-06, "loss": 0.0, "num_tokens": 1297424.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 4333 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 18.0, "completions/mean_terminated_length": 18.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 80.25925925925925, "frac_reward_zero_std": 1.0, "grad_norm": 0.00895024836063385, "kl": 0.0015191601123660803, "learning_rate": 3.148333333333334e-06, "loss": 0.0001, "num_tokens": 1297736.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4334 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.5, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 3.5, "completions/mean_terminated_length": 3.5, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 80.27777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.047252263873815536, "kl": 0.0031125694513320923, "learning_rate": 3.1477777777777778e-06, "loss": 0.0001, "num_tokens": 1297946.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 4335 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.0, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 20.0, "completions/mean_terminated_length": 20.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 80.29629629629629, "frac_reward_zero_std": 1.0, "grad_norm": 0.018009020015597343, "kl": 0.015318500809371471, "learning_rate": 3.1472222222222225e-06, "loss": 0.0008, "num_tokens": 1298274.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4336 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 16.5, "completions/mean_terminated_length": 16.5, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 80.31481481481481, "frac_reward_zero_std": 1.0, "grad_norm": 0.8974551558494568, "kl": 0.23364466801285744, "learning_rate": 3.146666666666667e-06, "loss": 0.0114, "num_tokens": 1298580.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4337 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 80.33333333333333, "frac_reward_zero_std": 1.0, "grad_norm": 0.011365102604031563, "kl": 0.21416565775871277, "learning_rate": 3.1461111111111112e-06, "loss": 0.0107, "num_tokens": 1298884.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4338 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 80.35185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.02699992246925831, "kl": 0.033140629529953, "learning_rate": 3.1455555555555556e-06, "loss": 0.0017, "num_tokens": 1299100.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 4339 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 80.37037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.025332679972052574, "kl": 0.0010905265808105469, "learning_rate": 3.1450000000000004e-06, "loss": 0.0001, "num_tokens": 1299312.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 4340 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 33.0, "completions/clipped_ratio": 0.0, "completions/max_length": 40.0, "completions/max_terminated_length": 40.0, "completions/mean_length": 33.0, "completions/mean_terminated_length": 33.0, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 80.38888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.057085152715444565, "kl": 0.03728696145117283, "learning_rate": 3.144444444444445e-06, "loss": 0.0018, "num_tokens": 1299668.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4341 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 80.4074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.020696111023426056, "kl": 0.004296638071537018, "learning_rate": 3.143888888888889e-06, "loss": 0.0002, "num_tokens": 1299928.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 4342 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.0, "completions/clipped_ratio": 0.0, "completions/max_length": 27.0, "completions/max_terminated_length": 27.0, "completions/mean_length": 23.0, "completions/mean_terminated_length": 23.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 80.42592592592592, "frac_reward_zero_std": 0.0, "grad_norm": 2.7494077682495117, "kl": 0.04226653277873993, "learning_rate": 3.143333333333334e-06, "loss": 0.0754, "num_tokens": 1300268.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 4343 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.75, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 8.75, "completions/mean_terminated_length": 8.75, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 80.44444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.11019372940063477, "kl": 0.019599175080657005, "learning_rate": 3.142777777777778e-06, "loss": 0.0011, "num_tokens": 1300515.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 4344 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 80.46296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.0007206641021184623, "kl": 0.0016708076000213623, "learning_rate": 3.1422222222222226e-06, "loss": 0.0001, "num_tokens": 1300795.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4345 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 80.48148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.02004384435713291, "kl": 0.013820333406329155, "learning_rate": 3.141666666666667e-06, "loss": 0.0007, "num_tokens": 1301055.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4346 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 80.5, "frac_reward_zero_std": 1.0, "grad_norm": 0.017055010423064232, "kl": 0.0049467021599411964, "learning_rate": 3.1411111111111113e-06, "loss": 0.0002, "num_tokens": 1301367.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 4347 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.25, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 11.25, "completions/mean_terminated_length": 11.25, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 80.51851851851852, "frac_reward_zero_std": 0.0, "grad_norm": 6.4266486167907715, "kl": 0.10471184179186821, "learning_rate": 3.1405555555555557e-06, "loss": 0.0826, "num_tokens": 1301632.0, "reward": 7.0, "reward_std": 1.0, "rewards/reward_combined/mean": 7.0, "rewards/reward_combined/std": 1.0, "step": 4348 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 34.75, "completions/clipped_ratio": 0.0, "completions/max_length": 40.0, "completions/max_terminated_length": 40.0, "completions/mean_length": 34.75, "completions/mean_terminated_length": 34.75, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 80.53703703703704, "frac_reward_zero_std": 0.0, "grad_norm": 2.8525702953338623, "kl": 0.20154965668916702, "learning_rate": 3.1400000000000004e-06, "loss": 0.1107, "num_tokens": 1301995.0, "reward": 4.625, "reward_std": 4.308422088623047, "rewards/reward_combined/mean": 4.625, "rewards/reward_combined/std": 4.308422088623047, "step": 4349 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.75, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 13.75, "completions/mean_terminated_length": 13.75, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 80.55555555555556, "frac_reward_zero_std": 0.0, "grad_norm": 3.485304832458496, "kl": 0.058997915126383305, "learning_rate": 3.1394444444444448e-06, "loss": 0.0354, "num_tokens": 1302266.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 4350 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 77.5, "completions/clipped_ratio": 0.0, "completions/max_length": 189.0, "completions/max_terminated_length": 189.0, "completions/mean_length": 77.5, "completions/mean_terminated_length": 77.5, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 80.57407407407408, "frac_reward_zero_std": 0.0, "grad_norm": 1.3563562631607056, "kl": 0.02683593798428774, "learning_rate": 3.138888888888889e-06, "loss": -0.1375, "num_tokens": 1302796.0, "reward": 4.625, "reward_std": 3.4731109142303467, "rewards/reward_combined/mean": 4.625, "rewards/reward_combined/std": 3.4731109142303467, "step": 4351 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 27.0, "completions/clipped_ratio": 0.0, "completions/max_length": 30.0, "completions/max_terminated_length": 30.0, "completions/mean_length": 27.0, "completions/mean_terminated_length": 27.0, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 80.5925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.13758128881454468, "kl": 0.10089828073978424, "learning_rate": 3.138333333333334e-06, "loss": 0.0051, "num_tokens": 1303140.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4352 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.0, "completions/clipped_ratio": 0.0, "completions/max_length": 37.0, "completions/max_terminated_length": 37.0, "completions/mean_length": 24.0, "completions/mean_terminated_length": 24.0, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 80.61111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.07379338145256042, "kl": 0.05710730329155922, "learning_rate": 3.137777777777778e-06, "loss": 0.003, "num_tokens": 1303488.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4353 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.0, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 19.0, "completions/mean_terminated_length": 19.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 80.62962962962963, "frac_reward_zero_std": 0.0, "grad_norm": 3.937513828277588, "kl": 0.11700394377112389, "learning_rate": 3.1372222222222226e-06, "loss": 0.0305, "num_tokens": 1303792.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 4354 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 26.0, "completions/clipped_ratio": 0.0, "completions/max_length": 33.0, "completions/max_terminated_length": 33.0, "completions/mean_length": 26.0, "completions/mean_terminated_length": 26.0, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 80.64814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.39007407426834106, "kl": 0.21336011588573456, "learning_rate": 3.1366666666666666e-06, "loss": 0.0107, "num_tokens": 1304112.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4355 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.25, "completions/clipped_ratio": 0.0, "completions/max_length": 42.0, "completions/max_terminated_length": 42.0, "completions/mean_length": 20.25, "completions/mean_terminated_length": 20.25, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 80.66666666666667, "frac_reward_zero_std": 0.0, "grad_norm": 6.9034104347229, "kl": 0.02412063255906105, "learning_rate": 3.1361111111111113e-06, "loss": 0.2695, "num_tokens": 1304413.0, "reward": 5.875, "reward_std": 3.25, "rewards/reward_combined/mean": 5.875, "rewards/reward_combined/std": 3.25, "step": 4356 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 12.5, "completions/mean_terminated_length": 12.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 80.68518518518519, "frac_reward_zero_std": 1.0, "grad_norm": 0.10573821514844894, "kl": 0.016713889315724373, "learning_rate": 3.1355555555555557e-06, "loss": 0.0008, "num_tokens": 1304687.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4357 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.5, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 17.5, "completions/mean_terminated_length": 17.5, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 80.70370370370371, "frac_reward_zero_std": 1.0, "grad_norm": 0.06394197791814804, "kl": 0.009187193354591727, "learning_rate": 3.135e-06, "loss": 0.0005, "num_tokens": 1305017.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4358 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 80.72222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.011955702677369118, "kl": 0.0011162332375533879, "learning_rate": 3.134444444444445e-06, "loss": 0.0001, "num_tokens": 1305297.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4359 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.25, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 20.25, "completions/mean_terminated_length": 20.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 80.74074074074075, "frac_reward_zero_std": 1.0, "grad_norm": 0.03222492337226868, "kl": 0.007519067265093327, "learning_rate": 3.133888888888889e-06, "loss": 0.0004, "num_tokens": 1305630.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4360 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 5.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 5.0, "completions/mean_terminated_length": 5.0, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 80.75925925925925, "frac_reward_zero_std": 1.0, "grad_norm": 0.0045545389875769615, "kl": 0.0003989339020336047, "learning_rate": 3.133333333333334e-06, "loss": 0.0, "num_tokens": 1305850.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 4361 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 80.77777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.06095147505402565, "kl": 0.014355242252349854, "learning_rate": 3.132777777777778e-06, "loss": 0.0007, "num_tokens": 1306122.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4362 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 9.5, "completions/mean_terminated_length": 9.5, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 80.79629629629629, "frac_reward_zero_std": 1.0, "grad_norm": 0.05008542165160179, "kl": 0.003955689258873463, "learning_rate": 3.1322222222222227e-06, "loss": 0.0002, "num_tokens": 1306380.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4363 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 34.0, "completions/clipped_ratio": 0.0, "completions/max_length": 36.0, "completions/max_terminated_length": 36.0, "completions/mean_length": 34.0, "completions/mean_terminated_length": 34.0, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 80.81481481481481, "frac_reward_zero_std": 1.0, "grad_norm": 0.05837956815958023, "kl": 0.07486037164926529, "learning_rate": 3.1316666666666666e-06, "loss": 0.0037, "num_tokens": 1306744.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4364 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.5, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 20.5, "completions/mean_terminated_length": 20.5, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 80.83333333333333, "frac_reward_zero_std": 1.0, "grad_norm": 0.16052016615867615, "kl": 0.046896965242922306, "learning_rate": 3.1311111111111114e-06, "loss": 0.0025, "num_tokens": 1307046.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4365 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.5, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 22.5, "completions/mean_terminated_length": 22.5, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 80.85185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.1975441724061966, "kl": 0.03922533802688122, "learning_rate": 3.1305555555555557e-06, "loss": 0.0021, "num_tokens": 1307380.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4366 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.75, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.75, "completions/mean_terminated_length": 6.75, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 80.87037037037037, "frac_reward_zero_std": 0.0, "grad_norm": 8.662459373474121, "kl": 0.006338220497127622, "learning_rate": 3.13e-06, "loss": -0.0421, "num_tokens": 1307615.0, "reward": 3.875, "reward_std": 0.25, "rewards/reward_combined/mean": 3.875, "rewards/reward_combined/std": 0.25, "step": 4367 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 80.88888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.027222774922847748, "kl": 0.045170003548264503, "learning_rate": 3.129444444444445e-06, "loss": 0.0023, "num_tokens": 1307909.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4368 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 80.9074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.06733706593513489, "kl": 0.01072357827797532, "learning_rate": 3.1288888888888892e-06, "loss": 0.0006, "num_tokens": 1308187.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4369 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.75, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 11.75, "completions/mean_terminated_length": 11.75, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 80.92592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.16168256103992462, "kl": 0.04361439496278763, "learning_rate": 3.1283333333333336e-06, "loss": 0.0023, "num_tokens": 1308458.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4370 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.0, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 24.0, "completions/mean_terminated_length": 24.0, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 80.94444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.0023437044583261013, "kl": 0.09029536694288254, "learning_rate": 3.127777777777778e-06, "loss": 0.0045, "num_tokens": 1308822.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 4371 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 80.96296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.018413498997688293, "kl": 0.0035926327109336853, "learning_rate": 3.1272222222222227e-06, "loss": 0.0002, "num_tokens": 1309110.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4372 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.5, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 21.5, "completions/mean_terminated_length": 21.5, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 80.98148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.7150583267211914, "kl": 0.20260045677423477, "learning_rate": 3.1266666666666667e-06, "loss": 0.0102, "num_tokens": 1309416.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 4373 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 81.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.0019898810423910618, "kl": 0.0026773191057145596, "learning_rate": 3.1261111111111114e-06, "loss": 0.0001, "num_tokens": 1309700.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4374 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 81.01851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.0085519440472126, "kl": 0.00026230166986351833, "learning_rate": 3.1255555555555554e-06, "loss": 0.0, "num_tokens": 1309970.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4375 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.75, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.75, "completions/mean_terminated_length": 6.75, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 81.03703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.03985312208533287, "kl": 0.00444625411182642, "learning_rate": 3.125e-06, "loss": 0.0002, "num_tokens": 1310205.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 4376 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 81.05555555555556, "frac_reward_zero_std": 0.0, "grad_norm": 4.590732097625732, "kl": 0.08877254463732243, "learning_rate": 3.124444444444445e-06, "loss": 0.0462, "num_tokens": 1310479.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 4377 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.0, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 17.0, "completions/mean_terminated_length": 17.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 81.07407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.04741493985056877, "kl": 0.014360523782670498, "learning_rate": 3.1238888888888893e-06, "loss": 0.0008, "num_tokens": 1310775.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4378 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.5, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 4.5, "completions/mean_terminated_length": 4.5, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 81.0925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.16746872663497925, "kl": 0.010923224966973066, "learning_rate": 3.1233333333333336e-06, "loss": 0.0005, "num_tokens": 1310993.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 4379 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.75, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 13.75, "completions/mean_terminated_length": 13.75, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 81.11111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.09233638644218445, "kl": 0.008117337943986058, "learning_rate": 3.122777777777778e-06, "loss": 0.0005, "num_tokens": 1311272.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4380 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.5, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 3.5, "completions/mean_terminated_length": 3.5, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 81.12962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.029379308223724365, "kl": 0.00284440815448761, "learning_rate": 3.1222222222222228e-06, "loss": 0.0001, "num_tokens": 1311482.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 4381 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 81.14814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.007094435393810272, "kl": 0.002364064916037023, "learning_rate": 3.1216666666666667e-06, "loss": 0.0001, "num_tokens": 1311750.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4382 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 27.0, "completions/clipped_ratio": 0.0, "completions/max_length": 28.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 27.0, "completions/mean_terminated_length": 27.0, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 81.16666666666667, "frac_reward_zero_std": 1.0, "grad_norm": 0.045564644038677216, "kl": 0.08419395983219147, "learning_rate": 3.1211111111111115e-06, "loss": 0.0042, "num_tokens": 1312094.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4383 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.5, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 19.5, "completions/mean_terminated_length": 19.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 81.18518518518519, "frac_reward_zero_std": 1.0, "grad_norm": 0.03538648784160614, "kl": 0.01619787933304906, "learning_rate": 3.1205555555555554e-06, "loss": 0.0008, "num_tokens": 1312420.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4384 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.25, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.25, "completions/mean_terminated_length": 14.25, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 81.20370370370371, "frac_reward_zero_std": 0.0, "grad_norm": 1.8147995471954346, "kl": 0.11192277260124683, "learning_rate": 3.12e-06, "loss": 0.0071, "num_tokens": 1312709.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 4385 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 81.22222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.29189532995224, "kl": 0.060461766086518764, "learning_rate": 3.119444444444445e-06, "loss": 0.0029, "num_tokens": 1312995.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4386 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 26.5, "completions/clipped_ratio": 0.0, "completions/max_length": 46.0, "completions/max_terminated_length": 46.0, "completions/mean_length": 26.5, "completions/mean_terminated_length": 26.5, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 81.24074074074075, "frac_reward_zero_std": 0.0, "grad_norm": 2.49240779876709, "kl": 0.09544357843697071, "learning_rate": 3.118888888888889e-06, "loss": 0.2364, "num_tokens": 1313321.0, "reward": 4.375, "reward_std": 2.0966243743896484, "rewards/reward_combined/mean": 4.375, "rewards/reward_combined/std": 2.0966243743896484, "step": 4387 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 81.25925925925925, "frac_reward_zero_std": 1.0, "grad_norm": 0.01588946022093296, "kl": 0.007595717906951904, "learning_rate": 3.1183333333333337e-06, "loss": 0.0004, "num_tokens": 1313557.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 4388 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 27.5, "completions/clipped_ratio": 0.0, "completions/max_length": 31.0, "completions/max_terminated_length": 31.0, "completions/mean_length": 27.5, "completions/mean_terminated_length": 27.5, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 81.27777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.689061164855957, "kl": 0.20124905556440353, "learning_rate": 3.117777777777778e-06, "loss": 0.0105, "num_tokens": 1313883.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4389 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 46.0, "completions/clipped_ratio": 0.0, "completions/max_length": 51.0, "completions/max_terminated_length": 51.0, "completions/mean_length": 46.0, "completions/mean_terminated_length": 46.0, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "epoch": 81.29629629629629, "frac_reward_zero_std": 1.0, "grad_norm": 0.01300664059817791, "kl": 0.034317681565880775, "learning_rate": 3.1172222222222224e-06, "loss": 0.0017, "num_tokens": 1314347.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 4390 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 81.31481481481481, "frac_reward_zero_std": 1.0, "grad_norm": 0.02312917448580265, "kl": 0.013099312782287598, "learning_rate": 3.1166666666666668e-06, "loss": 0.0007, "num_tokens": 1314607.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4391 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 81.33333333333333, "frac_reward_zero_std": 1.0, "grad_norm": 0.00016806994972284883, "kl": 2.5853514671325684e-05, "learning_rate": 3.1161111111111115e-06, "loss": 0.0, "num_tokens": 1314827.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 4392 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 81.35185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.040543075650930405, "kl": 0.0018419435364194214, "learning_rate": 3.1155555555555555e-06, "loss": 0.0001, "num_tokens": 1315105.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4393 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.75, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.75, "completions/mean_terminated_length": 14.75, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 81.37037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.01712915673851967, "kl": 0.005760894506238401, "learning_rate": 3.1150000000000002e-06, "loss": 0.0003, "num_tokens": 1315396.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4394 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.5, "completions/clipped_ratio": 0.0, "completions/max_length": 28.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 23.5, "completions/mean_terminated_length": 23.5, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 81.38888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.24713753163814545, "kl": 0.050096508115530014, "learning_rate": 3.114444444444445e-06, "loss": 0.0026, "num_tokens": 1315738.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 4395 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.0, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 21.0, "completions/mean_terminated_length": 21.0, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 81.4074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.026880159974098206, "kl": 0.0098980360198766, "learning_rate": 3.113888888888889e-06, "loss": 0.0005, "num_tokens": 1316074.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4396 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.5, "completions/clipped_ratio": 0.0, "completions/max_length": 39.0, "completions/max_terminated_length": 39.0, "completions/mean_length": 18.5, "completions/mean_terminated_length": 18.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 81.42592592592592, "frac_reward_zero_std": 0.0, "grad_norm": 4.536242961883545, "kl": 0.0662765372544527, "learning_rate": 3.1133333333333337e-06, "loss": 0.2846, "num_tokens": 1316372.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 4397 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 29.75, "completions/clipped_ratio": 0.0, "completions/max_length": 31.0, "completions/max_terminated_length": 31.0, "completions/mean_length": 29.75, "completions/mean_terminated_length": 29.75, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 81.44444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.051955413073301315, "kl": 0.03848607838153839, "learning_rate": 3.112777777777778e-06, "loss": 0.0019, "num_tokens": 1316715.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4398 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 81.46296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.033156801015138626, "kl": 0.004855424165725708, "learning_rate": 3.1122222222222224e-06, "loss": 0.0002, "num_tokens": 1316975.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 4399 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.5, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 18.5, "completions/mean_terminated_length": 18.5, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 81.48148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.08870881050825119, "kl": 0.010768704814836383, "learning_rate": 3.111666666666667e-06, "loss": 0.0006, "num_tokens": 1317289.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4400 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.5, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 22.5, "completions/mean_terminated_length": 22.5, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 81.5, "frac_reward_zero_std": 1.0, "grad_norm": 0.024927282705903053, "kl": 0.008057301864027977, "learning_rate": 3.1111111111111116e-06, "loss": 0.0004, "num_tokens": 1317607.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4401 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.75, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 19.75, "completions/mean_terminated_length": 19.75, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 81.51851851851852, "frac_reward_zero_std": 0.0, "grad_norm": 3.5369508266448975, "kl": 0.04546524025499821, "learning_rate": 3.1105555555555555e-06, "loss": 0.0518, "num_tokens": 1317906.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 4402 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.75, "completions/clipped_ratio": 0.0, "completions/max_length": 27.0, "completions/max_terminated_length": 27.0, "completions/mean_length": 24.75, "completions/mean_terminated_length": 24.75, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 81.53703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.02478041686117649, "kl": 0.08950205519795418, "learning_rate": 3.1100000000000003e-06, "loss": 0.0045, "num_tokens": 1318273.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 4403 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 34.0, "completions/clipped_ratio": 0.0, "completions/max_length": 34.0, "completions/max_terminated_length": 34.0, "completions/mean_length": 34.0, "completions/mean_terminated_length": 34.0, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 81.55555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.028225863352417946, "kl": 0.07931605353951454, "learning_rate": 3.109444444444445e-06, "loss": 0.004, "num_tokens": 1318637.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4404 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 81.57407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.00717367697507143, "kl": 0.0009530782699584961, "learning_rate": 3.108888888888889e-06, "loss": 0.0, "num_tokens": 1318849.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 4405 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 81.5925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.0071960557252168655, "kl": 0.0007638931274414062, "learning_rate": 3.1083333333333338e-06, "loss": 0.0, "num_tokens": 1319061.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 4406 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 81.61111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.017452774569392204, "kl": 0.006980649312026799, "learning_rate": 3.1077777777777777e-06, "loss": 0.0004, "num_tokens": 1319325.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4407 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.75, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 13.75, "completions/mean_terminated_length": 13.75, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 81.62962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.07999138534069061, "kl": 0.01494816830381751, "learning_rate": 3.1072222222222225e-06, "loss": 0.0008, "num_tokens": 1319612.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4408 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.75, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 12.75, "completions/mean_terminated_length": 12.75, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 81.64814814814815, "frac_reward_zero_std": 0.0, "grad_norm": 7.455813407897949, "kl": 0.01749139279127121, "learning_rate": 3.106666666666667e-06, "loss": 0.0221, "num_tokens": 1319895.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 4409 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.75, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 21.75, "completions/mean_terminated_length": 21.75, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 81.66666666666667, "frac_reward_zero_std": 1.0, "grad_norm": 0.10757986456155777, "kl": 0.05786377191543579, "learning_rate": 3.106111111111111e-06, "loss": 0.0028, "num_tokens": 1320202.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 4410 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 27.5, "completions/clipped_ratio": 0.0, "completions/max_length": 47.0, "completions/max_terminated_length": 47.0, "completions/mean_length": 27.5, "completions/mean_terminated_length": 27.5, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 81.68518518518519, "frac_reward_zero_std": 0.0, "grad_norm": 1.7759891748428345, "kl": 0.24557029269635677, "learning_rate": 3.1055555555555556e-06, "loss": 0.2222, "num_tokens": 1320564.0, "reward": 2.875, "reward_std": 3.3008837699890137, "rewards/reward_combined/mean": 2.875, "rewards/reward_combined/std": 3.3008837699890137, "step": 4411 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 17.0, "completions/mean_terminated_length": 17.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 81.70370370370371, "frac_reward_zero_std": 1.0, "grad_norm": 0.020743679255247116, "kl": 0.0025859649758785963, "learning_rate": 3.1050000000000003e-06, "loss": 0.0001, "num_tokens": 1320872.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4412 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.5, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 10.5, "completions/mean_terminated_length": 10.5, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 81.72222222222223, "frac_reward_zero_std": 0.0, "grad_norm": 6.2079620361328125, "kl": 0.022927332669496536, "learning_rate": 3.104444444444445e-06, "loss": 0.1382, "num_tokens": 1321134.0, "reward": 2.75, "reward_std": 1.5, "rewards/reward_combined/mean": 2.75, "rewards/reward_combined/std": 1.5, "step": 4413 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 81.74074074074075, "frac_reward_zero_std": 1.0, "grad_norm": 0.04203316941857338, "kl": 0.005484391935169697, "learning_rate": 3.103888888888889e-06, "loss": 0.0003, "num_tokens": 1321394.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4414 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.75, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 16.75, "completions/mean_terminated_length": 16.75, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 81.75925925925925, "frac_reward_zero_std": 1.0, "grad_norm": 0.07193493098020554, "kl": 0.15114140510559082, "learning_rate": 3.103333333333334e-06, "loss": 0.0076, "num_tokens": 1321701.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4415 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 18.0, "completions/mean_terminated_length": 18.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 81.77777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.12404725700616837, "kl": 0.11130328103899956, "learning_rate": 3.1027777777777778e-06, "loss": 0.0055, "num_tokens": 1322001.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4416 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 81.79629629629629, "frac_reward_zero_std": 1.0, "grad_norm": 0.01631123386323452, "kl": 0.00903015211224556, "learning_rate": 3.1022222222222225e-06, "loss": 0.0005, "num_tokens": 1322313.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 4417 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 41.75, "completions/clipped_ratio": 0.0, "completions/max_length": 50.0, "completions/max_terminated_length": 50.0, "completions/mean_length": 41.75, "completions/mean_terminated_length": 41.75, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 81.81481481481481, "frac_reward_zero_std": 1.0, "grad_norm": 0.059100184589624405, "kl": 0.025700380094349384, "learning_rate": 3.101666666666667e-06, "loss": 0.0013, "num_tokens": 1322700.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4418 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 28.25, "completions/clipped_ratio": 0.0, "completions/max_length": 32.0, "completions/max_terminated_length": 32.0, "completions/mean_length": 28.25, "completions/mean_terminated_length": 28.25, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 81.83333333333333, "frac_reward_zero_std": 0.0, "grad_norm": 2.1818432807922363, "kl": 0.17147932946681976, "learning_rate": 3.1011111111111113e-06, "loss": -0.0828, "num_tokens": 1323037.0, "reward": 7.875, "reward_std": 0.25, "rewards/reward_combined/mean": 7.875, "rewards/reward_combined/std": 0.25, "step": 4419 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 81.85185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.004192747175693512, "kl": 0.00016041398703237064, "learning_rate": 3.1005555555555556e-06, "loss": 0.0, "num_tokens": 1323293.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 4420 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.75, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 14.75, "completions/mean_terminated_length": 14.75, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 81.87037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.4636780917644501, "kl": 0.14608434587717056, "learning_rate": 3.1000000000000004e-06, "loss": 0.0071, "num_tokens": 1323568.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4421 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.25, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 18.25, "completions/mean_terminated_length": 18.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 81.88888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.03988955542445183, "kl": 0.009444834664463997, "learning_rate": 3.0994444444444447e-06, "loss": 0.0005, "num_tokens": 1323885.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4422 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.25, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 4.25, "completions/mean_terminated_length": 4.25, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 81.9074074074074, "frac_reward_zero_std": 0.0, "grad_norm": 5.490312099456787, "kl": 0.4155937656760216, "learning_rate": 3.098888888888889e-06, "loss": 0.084, "num_tokens": 1324102.0, "reward": 3.875, "reward_std": 0.25, "rewards/reward_combined/mean": 3.875, "rewards/reward_combined/std": 0.25, "step": 4423 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.75, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 16.75, "completions/mean_terminated_length": 16.75, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 81.92592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.022325368598103523, "kl": 0.0019186859135515988, "learning_rate": 3.098333333333334e-06, "loss": 0.0001, "num_tokens": 1324429.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4424 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 81.94444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.23466336727142334, "kl": 0.05790402181446552, "learning_rate": 3.097777777777778e-06, "loss": 0.0028, "num_tokens": 1324701.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4425 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 81.96296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.008585622534155846, "kl": 0.00048553571105003357, "learning_rate": 3.0972222222222226e-06, "loss": 0.0, "num_tokens": 1324945.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 4426 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 81.98148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.5460911393165588, "kl": 0.0611006380058825, "learning_rate": 3.0966666666666665e-06, "loss": 0.0035, "num_tokens": 1325245.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4427 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 82.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.01971575804054737, "kl": 0.25284604728221893, "learning_rate": 3.0961111111111113e-06, "loss": 0.0126, "num_tokens": 1325543.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4428 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.25, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.25, "completions/mean_terminated_length": 11.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 82.01851851851852, "frac_reward_zero_std": 0.0, "grad_norm": 5.590091705322266, "kl": 0.08176860585808754, "learning_rate": 3.0955555555555557e-06, "loss": 0.0301, "num_tokens": 1325816.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 4429 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.25, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 20.25, "completions/mean_terminated_length": 20.25, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 82.03703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.03928106650710106, "kl": 0.008261157432571054, "learning_rate": 3.0950000000000004e-06, "loss": 0.0004, "num_tokens": 1326145.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4430 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.75, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 12.75, "completions/mean_terminated_length": 12.75, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 82.05555555555556, "frac_reward_zero_std": 0.0, "grad_norm": 5.083174705505371, "kl": 0.022220906510483474, "learning_rate": 3.094444444444445e-06, "loss": 0.05, "num_tokens": 1326420.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 4431 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 9.5, "completions/mean_terminated_length": 9.5, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 82.07407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.08553533256053925, "kl": 0.024374770000576973, "learning_rate": 3.093888888888889e-06, "loss": 0.0011, "num_tokens": 1326670.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 4432 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.25, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 20.25, "completions/mean_terminated_length": 20.25, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 82.0925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.04551943764090538, "kl": 0.01381648937240243, "learning_rate": 3.093333333333334e-06, "loss": 0.0007, "num_tokens": 1326995.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4433 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 82.11111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.00016395577404182404, "kl": 3.582984209060669e-05, "learning_rate": 3.092777777777778e-06, "loss": 0.0, "num_tokens": 1327215.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 4434 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.75, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.75, "completions/mean_terminated_length": 14.75, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 82.12962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.00176240224391222, "kl": 0.0011809704592451453, "learning_rate": 3.0922222222222226e-06, "loss": 0.0001, "num_tokens": 1327534.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4435 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.5, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 2.5, "completions/mean_terminated_length": 2.5, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 82.14814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.08222459256649017, "kl": 0.0086030513048172, "learning_rate": 3.0916666666666666e-06, "loss": 0.0004, "num_tokens": 1327740.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 4436 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 82.16666666666667, "frac_reward_zero_std": 1.0, "grad_norm": 0.016295917332172394, "kl": 0.24003278464078903, "learning_rate": 3.0911111111111114e-06, "loss": 0.0119, "num_tokens": 1328040.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4437 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 82.18518518518519, "frac_reward_zero_std": 1.0, "grad_norm": 0.014927576296031475, "kl": 0.008028656244277954, "learning_rate": 3.0905555555555557e-06, "loss": 0.0004, "num_tokens": 1328276.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 4438 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 82.20370370370371, "frac_reward_zero_std": 1.0, "grad_norm": 0.018694352358579636, "kl": 0.001320989365922287, "learning_rate": 3.09e-06, "loss": 0.0001, "num_tokens": 1328554.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4439 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.0, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 24.0, "completions/mean_terminated_length": 24.0, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 82.22222222222223, "frac_reward_zero_std": 0.0, "grad_norm": 2.2997117042541504, "kl": 0.8648179098963737, "learning_rate": 3.089444444444445e-06, "loss": 0.0419, "num_tokens": 1328918.0, "reward": 2.875, "reward_std": 1.25, "rewards/reward_combined/mean": 2.875, "rewards/reward_combined/std": 1.25, "step": 4440 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.75, "completions/clipped_ratio": 0.0, "completions/max_length": 33.0, "completions/max_terminated_length": 33.0, "completions/mean_length": 21.75, "completions/mean_terminated_length": 21.75, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 82.24074074074075, "frac_reward_zero_std": 1.0, "grad_norm": 0.5388180613517761, "kl": 0.06766625493764877, "learning_rate": 3.088888888888889e-06, "loss": 0.0042, "num_tokens": 1329253.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 4441 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.25, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.25, "completions/mean_terminated_length": 14.25, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 82.25925925925925, "frac_reward_zero_std": 1.0, "grad_norm": 0.04010939970612526, "kl": 0.012326718308031559, "learning_rate": 3.0883333333333336e-06, "loss": 0.0006, "num_tokens": 1329542.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4442 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 28.75, "completions/clipped_ratio": 0.0, "completions/max_length": 32.0, "completions/max_terminated_length": 32.0, "completions/mean_length": 28.75, "completions/mean_terminated_length": 28.75, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 82.27777777777777, "frac_reward_zero_std": 0.0, "grad_norm": 2.0994958877563477, "kl": 0.2464097999036312, "learning_rate": 3.087777777777778e-06, "loss": 0.0084, "num_tokens": 1329893.0, "reward": 5.25, "reward_std": 2.598076105117798, "rewards/reward_combined/mean": 5.25, "rewards/reward_combined/std": 2.598076105117798, "step": 4443 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 82.29629629629629, "frac_reward_zero_std": 1.0, "grad_norm": 0.08565664291381836, "kl": 0.006236575776711106, "learning_rate": 3.0872222222222227e-06, "loss": 0.0003, "num_tokens": 1330157.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4444 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 5.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 5.0, "completions/mean_terminated_length": 5.0, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 82.31481481481481, "frac_reward_zero_std": 1.0, "grad_norm": 0.005120075773447752, "kl": 0.0005956828827038407, "learning_rate": 3.0866666666666666e-06, "loss": 0.0, "num_tokens": 1330377.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 4445 }, { "clip_ratio/high_max": 0.013513513840734959, "clip_ratio/high_mean": 0.013513513840734959, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.013513513840734959, "completion_length": 19.75, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 19.75, "completions/mean_terminated_length": 19.75, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 82.33333333333333, "frac_reward_zero_std": 0.0, "grad_norm": 4.0752763748168945, "kl": 0.132355697453022, "learning_rate": 3.0861111111111114e-06, "loss": 0.0822, "num_tokens": 1330676.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 4446 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 82.35185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.01840001717209816, "kl": 0.010361284017562866, "learning_rate": 3.085555555555556e-06, "loss": 0.0005, "num_tokens": 1330948.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4447 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 82.37037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.13516414165496826, "kl": 0.05109521560370922, "learning_rate": 3.085e-06, "loss": 0.0025, "num_tokens": 1331218.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4448 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.5, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 8.5, "completions/mean_terminated_length": 8.5, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 82.38888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.09234651923179626, "kl": 0.01135116326622665, "learning_rate": 3.084444444444445e-06, "loss": 0.0004, "num_tokens": 1331472.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 4449 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.25, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 11.25, "completions/mean_terminated_length": 11.25, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 82.4074074074074, "frac_reward_zero_std": 0.0, "grad_norm": 3.4805350303649902, "kl": 0.5344561208039522, "learning_rate": 3.0838888888888892e-06, "loss": -0.0371, "num_tokens": 1331737.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 4450 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 82.42592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.03390654921531677, "kl": 0.026431579142808914, "learning_rate": 3.0833333333333336e-06, "loss": 0.0013, "num_tokens": 1332005.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4451 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 82.44444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.0023596342653036118, "kl": 0.003421967616304755, "learning_rate": 3.082777777777778e-06, "loss": 0.0002, "num_tokens": 1332301.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4452 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 82.46296296296296, "frac_reward_zero_std": 0.0, "grad_norm": 1.659102201461792, "kl": 0.007497334852814674, "learning_rate": 3.0822222222222227e-06, "loss": 0.0021, "num_tokens": 1332613.0, "reward": 2.75, "reward_std": 1.5, "rewards/reward_combined/mean": 2.75, "rewards/reward_combined/std": 1.5, "step": 4453 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.75, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.75, "completions/mean_terminated_length": 6.75, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 82.48148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.04796583950519562, "kl": 0.0026250871596857905, "learning_rate": 3.0816666666666667e-06, "loss": 0.0001, "num_tokens": 1332848.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 4454 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 31.5, "completions/clipped_ratio": 0.0, "completions/max_length": 33.0, "completions/max_terminated_length": 33.0, "completions/mean_length": 31.5, "completions/mean_terminated_length": 31.5, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 82.5, "frac_reward_zero_std": 1.0, "grad_norm": 0.08831838518381119, "kl": 0.16380514949560165, "learning_rate": 3.0811111111111114e-06, "loss": 0.0082, "num_tokens": 1333198.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4455 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.0, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 22.0, "completions/mean_terminated_length": 22.0, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 82.51851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.14704734086990356, "kl": 0.029194061644375324, "learning_rate": 3.0805555555555562e-06, "loss": 0.0015, "num_tokens": 1333514.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4456 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 47.0, "completions/clipped_ratio": 0.0, "completions/max_length": 51.0, "completions/max_terminated_length": 51.0, "completions/mean_length": 47.0, "completions/mean_terminated_length": 47.0, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 82.53703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.037573736160993576, "kl": 0.0420612096786499, "learning_rate": 3.08e-06, "loss": 0.0021, "num_tokens": 1333982.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 4457 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 82.55555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.00833174679428339, "kl": 0.00020316839800216258, "learning_rate": 3.079444444444445e-06, "loss": 0.0, "num_tokens": 1334238.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 4458 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 27.25, "completions/clipped_ratio": 0.0, "completions/max_length": 33.0, "completions/max_terminated_length": 33.0, "completions/mean_length": 27.25, "completions/mean_terminated_length": 27.25, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 82.57407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.11214038729667664, "kl": 0.03892693854868412, "learning_rate": 3.078888888888889e-06, "loss": 0.002, "num_tokens": 1334571.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4459 }, { "clip_ratio/high_max": 0.014285714365541935, "clip_ratio/high_mean": 0.014285714365541935, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.014285714365541935, "completion_length": 19.0, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 19.0, "completions/mean_terminated_length": 19.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 82.5925925925926, "frac_reward_zero_std": 0.0, "grad_norm": 4.2904534339904785, "kl": 0.1437351256608963, "learning_rate": 3.0783333333333336e-06, "loss": 0.0711, "num_tokens": 1334887.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 4460 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 82.61111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.013425588607788086, "kl": 0.034443989396095276, "learning_rate": 3.077777777777778e-06, "loss": 0.0017, "num_tokens": 1335103.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 4461 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 26.5, "completions/clipped_ratio": 0.0, "completions/max_length": 48.0, "completions/max_terminated_length": 48.0, "completions/mean_length": 26.5, "completions/mean_terminated_length": 26.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 82.62962962962963, "frac_reward_zero_std": 0.0, "grad_norm": 3.109557628631592, "kl": 0.06434831023216248, "learning_rate": 3.0772222222222224e-06, "loss": 0.0168, "num_tokens": 1335429.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 4462 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.25, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 15.25, "completions/mean_terminated_length": 15.25, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 82.64814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.05430451035499573, "kl": 0.034844234585762024, "learning_rate": 3.0766666666666667e-06, "loss": 0.0017, "num_tokens": 1335722.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4463 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.75, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 18.75, "completions/mean_terminated_length": 18.75, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 82.66666666666667, "frac_reward_zero_std": 1.0, "grad_norm": 0.33623984456062317, "kl": 0.038357241079211235, "learning_rate": 3.0761111111111115e-06, "loss": 0.0019, "num_tokens": 1336017.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4464 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.75, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 14.75, "completions/mean_terminated_length": 14.75, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 82.68518518518519, "frac_reward_zero_std": 1.0, "grad_norm": 0.4468958079814911, "kl": 0.10483380127698183, "learning_rate": 3.0755555555555563e-06, "loss": 0.0057, "num_tokens": 1336316.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4465 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 5.5, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 5.5, "completions/mean_terminated_length": 5.5, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 82.70370370370371, "frac_reward_zero_std": 0.0, "grad_norm": 13.192193031311035, "kl": 0.027675900608301163, "learning_rate": 3.075e-06, "loss": 0.1876, "num_tokens": 1336534.0, "reward": 2.75, "reward_std": 1.5, "rewards/reward_combined/mean": 2.75, "rewards/reward_combined/std": 1.5, "step": 4466 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 82.72222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.049637261778116226, "kl": 0.013700686860829592, "learning_rate": 3.074444444444445e-06, "loss": 0.0007, "num_tokens": 1336824.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4467 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 12.5, "completions/mean_terminated_length": 12.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 82.74074074074075, "frac_reward_zero_std": 1.0, "grad_norm": 0.012887080200016499, "kl": 0.003232288174331188, "learning_rate": 3.073888888888889e-06, "loss": 0.0002, "num_tokens": 1337106.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4468 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.5, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 19.5, "completions/mean_terminated_length": 19.5, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 82.75925925925925, "frac_reward_zero_std": 1.0, "grad_norm": 0.07298915088176727, "kl": 0.01801220327615738, "learning_rate": 3.0733333333333337e-06, "loss": 0.0009, "num_tokens": 1337436.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4469 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 82.77777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.020823776721954346, "kl": 0.0026984004070982337, "learning_rate": 3.072777777777778e-06, "loss": 0.0001, "num_tokens": 1337696.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4470 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 82.79629629629629, "frac_reward_zero_std": 1.0, "grad_norm": 0.0074548861011862755, "kl": 0.0017378032207489014, "learning_rate": 3.0722222222222224e-06, "loss": 0.0001, "num_tokens": 1337908.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 4471 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 82.81481481481481, "frac_reward_zero_std": 1.0, "grad_norm": 0.02505820244550705, "kl": 0.0046468451619148254, "learning_rate": 3.0716666666666668e-06, "loss": 0.0002, "num_tokens": 1338168.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 4472 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 34.25, "completions/clipped_ratio": 0.0, "completions/max_length": 35.0, "completions/max_terminated_length": 35.0, "completions/mean_length": 34.25, "completions/mean_terminated_length": 34.25, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 82.83333333333333, "frac_reward_zero_std": 1.0, "grad_norm": 0.42270952463150024, "kl": 0.1137915551662445, "learning_rate": 3.0711111111111115e-06, "loss": 0.0057, "num_tokens": 1338533.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4473 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 82.85185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.0008605095208622515, "kl": 0.001728860370349139, "learning_rate": 3.070555555555556e-06, "loss": 0.0001, "num_tokens": 1338813.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4474 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.0, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 20.0, "completions/mean_terminated_length": 20.0, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 82.87037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.10469565540552139, "kl": 0.08472064882516861, "learning_rate": 3.0700000000000003e-06, "loss": 0.0042, "num_tokens": 1339121.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4475 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.0, "completions/clipped_ratio": 0.0, "completions/max_length": 30.0, "completions/max_terminated_length": 30.0, "completions/mean_length": 24.0, "completions/mean_terminated_length": 24.0, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 82.88888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.19313621520996094, "kl": 0.06837853789329529, "learning_rate": 3.069444444444445e-06, "loss": 0.0035, "num_tokens": 1339469.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4476 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 18.0, "completions/mean_terminated_length": 18.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 82.9074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.008281392976641655, "kl": 0.002388894557952881, "learning_rate": 3.068888888888889e-06, "loss": 0.0001, "num_tokens": 1339781.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4477 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.25, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 19.25, "completions/mean_terminated_length": 19.25, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 82.92592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.06955397874116898, "kl": 0.010496057104319334, "learning_rate": 3.0683333333333337e-06, "loss": 0.0005, "num_tokens": 1340086.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4478 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.75, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 12.75, "completions/mean_terminated_length": 12.75, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 82.94444444444444, "frac_reward_zero_std": 0.0, "grad_norm": 2.1862375736236572, "kl": 0.021146115846931934, "learning_rate": 3.0677777777777777e-06, "loss": 0.0547, "num_tokens": 1340361.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 4479 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 30.75, "completions/clipped_ratio": 0.0, "completions/max_length": 33.0, "completions/max_terminated_length": 33.0, "completions/mean_length": 30.75, "completions/mean_terminated_length": 30.75, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 82.96296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.31566423177719116, "kl": 0.18131405115127563, "learning_rate": 3.0672222222222225e-06, "loss": 0.0092, "num_tokens": 1340700.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4480 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 82.98148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.07030584663152695, "kl": 0.01632215827703476, "learning_rate": 3.066666666666667e-06, "loss": 0.0008, "num_tokens": 1340984.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4481 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.25, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 9.25, "completions/mean_terminated_length": 9.25, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 83.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.05633677542209625, "kl": 0.018936037085950375, "learning_rate": 3.066111111111111e-06, "loss": 0.001, "num_tokens": 1341245.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4482 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 83.01851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.01600354164838791, "kl": 0.005610186140984297, "learning_rate": 3.065555555555556e-06, "loss": 0.0003, "num_tokens": 1341543.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4483 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 27.5, "completions/clipped_ratio": 0.0, "completions/max_length": 33.0, "completions/max_terminated_length": 33.0, "completions/mean_length": 27.5, "completions/mean_terminated_length": 27.5, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 83.03703703703704, "frac_reward_zero_std": 0.0, "grad_norm": 3.012941837310791, "kl": 0.1549070030450821, "learning_rate": 3.0650000000000003e-06, "loss": 0.0508, "num_tokens": 1341869.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 4484 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 34.75, "completions/clipped_ratio": 0.0, "completions/max_length": 37.0, "completions/max_terminated_length": 37.0, "completions/mean_length": 34.75, "completions/mean_terminated_length": 34.75, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 83.05555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.07541295886039734, "kl": 0.0976540595293045, "learning_rate": 3.064444444444445e-06, "loss": 0.0049, "num_tokens": 1342236.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4485 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.25, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 10.25, "completions/mean_terminated_length": 10.25, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 83.07407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.4065916836261749, "kl": 0.0959920585155487, "learning_rate": 3.063888888888889e-06, "loss": 0.0048, "num_tokens": 1342497.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4486 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 18.0, "completions/mean_terminated_length": 18.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 83.0925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.05109339952468872, "kl": 0.02284581959247589, "learning_rate": 3.063333333333334e-06, "loss": 0.0011, "num_tokens": 1342817.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4487 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.5, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 8.5, "completions/mean_terminated_length": 8.5, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 83.11111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.06537017226219177, "kl": 0.005989123834297061, "learning_rate": 3.0627777777777777e-06, "loss": 0.0003, "num_tokens": 1343071.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4488 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.25, "completions/clipped_ratio": 0.0, "completions/max_length": 28.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 23.25, "completions/mean_terminated_length": 23.25, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 83.12962962962963, "frac_reward_zero_std": 0.0, "grad_norm": 3.5806751251220703, "kl": 0.29591700062155724, "learning_rate": 3.0622222222222225e-06, "loss": 0.1224, "num_tokens": 1343416.0, "reward": 4.625, "reward_std": 3.4731109142303467, "rewards/reward_combined/mean": 4.625, "rewards/reward_combined/std": 3.4731109142303467, "step": 4489 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 83.14814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.03540436550974846, "kl": 0.008538441499695182, "learning_rate": 3.061666666666667e-06, "loss": 0.0004, "num_tokens": 1343706.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4490 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.25, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 18.25, "completions/mean_terminated_length": 18.25, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 83.16666666666667, "frac_reward_zero_std": 1.0, "grad_norm": 0.020314816385507584, "kl": 0.05143996514379978, "learning_rate": 3.0611111111111112e-06, "loss": 0.0026, "num_tokens": 1343999.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 4491 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 83.18518518518519, "frac_reward_zero_std": 1.0, "grad_norm": 0.04642611742019653, "kl": 0.03639167547225952, "learning_rate": 3.060555555555556e-06, "loss": 0.0019, "num_tokens": 1344269.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4492 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.5, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 9.5, "completions/mean_terminated_length": 9.5, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 83.20370370370371, "frac_reward_zero_std": 1.0, "grad_norm": 0.12113795429468155, "kl": 0.013638820732012391, "learning_rate": 3.0600000000000003e-06, "loss": 0.0007, "num_tokens": 1344531.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4493 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 83.22222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.022341124713420868, "kl": 0.003307503997348249, "learning_rate": 3.0594444444444447e-06, "loss": 0.0002, "num_tokens": 1344811.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4494 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 7.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 7.0, "completions/mean_terminated_length": 7.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 83.24074074074075, "frac_reward_zero_std": 1.0, "grad_norm": 0.09029962867498398, "kl": 0.007985096890479326, "learning_rate": 3.058888888888889e-06, "loss": 0.0003, "num_tokens": 1345059.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 4495 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 83.25925925925925, "frac_reward_zero_std": 1.0, "grad_norm": 0.11174161732196808, "kl": 0.020781581290066242, "learning_rate": 3.058333333333334e-06, "loss": 0.001, "num_tokens": 1345331.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4496 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 83.27777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.001989157637581229, "kl": 0.0001675009734753985, "learning_rate": 3.0577777777777778e-06, "loss": 0.0, "num_tokens": 1345587.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 4497 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.75, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 12.75, "completions/mean_terminated_length": 12.75, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 83.29629629629629, "frac_reward_zero_std": 1.0, "grad_norm": 0.23476377129554749, "kl": 0.03334043640643358, "learning_rate": 3.0572222222222226e-06, "loss": 0.0017, "num_tokens": 1345854.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4498 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 83.31481481481481, "frac_reward_zero_std": 1.0, "grad_norm": 0.00016491430869791657, "kl": 3.477931022644043e-05, "learning_rate": 3.0566666666666665e-06, "loss": 0.0, "num_tokens": 1346074.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 4499 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.5, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 12.5, "completions/mean_terminated_length": 12.5, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 83.33333333333333, "frac_reward_zero_std": 1.0, "grad_norm": 0.02989822067320347, "kl": 0.0017163055599667132, "learning_rate": 3.0561111111111113e-06, "loss": 0.0001, "num_tokens": 1346348.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4500 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.5, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 2.5, "completions/mean_terminated_length": 2.5, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 83.35185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.04131662845611572, "kl": 0.003536120057106018, "learning_rate": 3.055555555555556e-06, "loss": 0.0002, "num_tokens": 1346562.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 4501 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.25, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 19.25, "completions/mean_terminated_length": 19.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 83.37037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.11572865396738052, "kl": 0.08903741091489792, "learning_rate": 3.0550000000000004e-06, "loss": 0.0044, "num_tokens": 1346867.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4502 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 83.38888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.012784760445356369, "kl": 0.0020993222715333104, "learning_rate": 3.0544444444444448e-06, "loss": 0.0001, "num_tokens": 1347165.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4503 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 45.5, "completions/clipped_ratio": 0.0, "completions/max_length": 58.0, "completions/max_terminated_length": 58.0, "completions/mean_length": 45.5, "completions/mean_terminated_length": 45.5, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "epoch": 83.4074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.09112471342086792, "kl": 0.041279034689068794, "learning_rate": 3.053888888888889e-06, "loss": 0.0021, "num_tokens": 1347627.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 4504 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.25, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.25, "completions/mean_terminated_length": 13.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 83.42592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.0013807018985971808, "kl": 0.00188433937728405, "learning_rate": 3.053333333333334e-06, "loss": 0.0001, "num_tokens": 1347904.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4505 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 5.75, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 5.75, "completions/mean_terminated_length": 5.75, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 83.44444444444444, "frac_reward_zero_std": 0.0, "grad_norm": 5.675228595733643, "kl": 0.01511330419452861, "learning_rate": 3.052777777777778e-06, "loss": 0.1212, "num_tokens": 1348127.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 4506 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 83.46296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.016671353951096535, "kl": 0.0075468942523002625, "learning_rate": 3.0522222222222226e-06, "loss": 0.0004, "num_tokens": 1348363.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 4507 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 83.48148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.03309008106589317, "kl": 0.0014683455228805542, "learning_rate": 3.0516666666666665e-06, "loss": 0.0001, "num_tokens": 1348575.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 4508 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.25, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 17.25, "completions/mean_terminated_length": 17.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 83.5, "frac_reward_zero_std": 1.0, "grad_norm": 0.14492397010326385, "kl": 0.1805749237537384, "learning_rate": 3.0511111111111113e-06, "loss": 0.009, "num_tokens": 1348884.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4509 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 83.51851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.013141773641109467, "kl": 0.03411140292882919, "learning_rate": 3.050555555555556e-06, "loss": 0.0017, "num_tokens": 1349100.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 4510 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 7.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 7.0, "completions/mean_terminated_length": 7.0, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 83.53703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.008403259329497814, "kl": 0.0004317334678489715, "learning_rate": 3.05e-06, "loss": 0.0, "num_tokens": 1349336.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 4511 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 83.55555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.0036691694986075163, "kl": 0.017022019252181053, "learning_rate": 3.049444444444445e-06, "loss": 0.0009, "num_tokens": 1349648.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 4512 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.0, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 22.0, "completions/mean_terminated_length": 22.0, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 83.57407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.014448894187808037, "kl": 0.010950278490781784, "learning_rate": 3.048888888888889e-06, "loss": 0.0005, "num_tokens": 1349964.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4513 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 26.0, "completions/clipped_ratio": 0.0, "completions/max_length": 30.0, "completions/max_terminated_length": 30.0, "completions/mean_length": 26.0, "completions/mean_terminated_length": 26.0, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 83.5925925925926, "frac_reward_zero_std": 0.0, "grad_norm": 2.826044797897339, "kl": 0.1250813752412796, "learning_rate": 3.0483333333333335e-06, "loss": 0.0646, "num_tokens": 1350304.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 4514 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 83.61111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.025315262377262115, "kl": 0.0044570863246917725, "learning_rate": 3.047777777777778e-06, "loss": 0.0002, "num_tokens": 1350564.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 4515 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.5, "completions/clipped_ratio": 0.0, "completions/max_length": 28.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 22.5, "completions/mean_terminated_length": 22.5, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 83.62962962962963, "frac_reward_zero_std": 0.0, "grad_norm": 3.8641977310180664, "kl": 0.04405417665839195, "learning_rate": 3.0472222222222226e-06, "loss": 0.0889, "num_tokens": 1350898.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 4516 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.75, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 20.75, "completions/mean_terminated_length": 20.75, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 83.64814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.05570821464061737, "kl": 0.016374733299016953, "learning_rate": 3.0466666666666666e-06, "loss": 0.0008, "num_tokens": 1351233.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4517 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 83.66666666666667, "frac_reward_zero_std": 1.0, "grad_norm": 0.04206191003322601, "kl": 0.016671850811690092, "learning_rate": 3.0461111111111114e-06, "loss": 0.0008, "num_tokens": 1351521.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4518 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.0, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 17.0, "completions/mean_terminated_length": 17.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 83.68518518518519, "frac_reward_zero_std": 1.0, "grad_norm": 0.025011800229549408, "kl": 0.0067759412340819836, "learning_rate": 3.045555555555556e-06, "loss": 0.0003, "num_tokens": 1351817.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4519 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.75, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 13.75, "completions/mean_terminated_length": 13.75, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 83.70370370370371, "frac_reward_zero_std": 1.0, "grad_norm": 0.020150313153862953, "kl": 0.011327409534715116, "learning_rate": 3.045e-06, "loss": 0.0006, "num_tokens": 1352104.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4520 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.5, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 12.5, "completions/mean_terminated_length": 12.5, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 83.72222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.061893898993730545, "kl": 0.006602793699130416, "learning_rate": 3.044444444444445e-06, "loss": 0.0003, "num_tokens": 1352366.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4521 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 83.74074074074075, "frac_reward_zero_std": 0.0, "grad_norm": 4.140169620513916, "kl": 0.017480580136179924, "learning_rate": 3.043888888888889e-06, "loss": 0.0383, "num_tokens": 1352642.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 4522 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.25, "completions/clipped_ratio": 0.0, "completions/max_length": 29.0, "completions/max_terminated_length": 29.0, "completions/mean_length": 25.25, "completions/mean_terminated_length": 25.25, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 83.75925925925925, "frac_reward_zero_std": 0.0, "grad_norm": 2.315798282623291, "kl": 0.13978025317192078, "learning_rate": 3.0433333333333336e-06, "loss": -0.0856, "num_tokens": 1352967.0, "reward": 7.875, "reward_std": 0.25, "rewards/reward_combined/mean": 7.875, "rewards/reward_combined/std": 0.25, "step": 4523 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 83.77777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.004219022113829851, "kl": 0.001197670353576541, "learning_rate": 3.042777777777778e-06, "loss": 0.0001, "num_tokens": 1353287.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4524 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 30.0, "completions/clipped_ratio": 0.0, "completions/max_length": 31.0, "completions/max_terminated_length": 31.0, "completions/mean_length": 30.0, "completions/mean_terminated_length": 30.0, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 83.79629629629629, "frac_reward_zero_std": 0.0, "grad_norm": 3.717437744140625, "kl": 0.147751085460186, "learning_rate": 3.0422222222222227e-06, "loss": 0.0253, "num_tokens": 1353631.0, "reward": 5.25, "reward_std": 4.5, "rewards/reward_combined/mean": 5.25, "rewards/reward_combined/std": 4.5, "step": 4525 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 18.0, "completions/mean_terminated_length": 18.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 83.81481481481481, "frac_reward_zero_std": 1.0, "grad_norm": 0.033122703433036804, "kl": 0.004968646680936217, "learning_rate": 3.0416666666666666e-06, "loss": 0.0002, "num_tokens": 1353943.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4526 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.008064515888690948, "clip_ratio/low_min": 0.008064515888690948, "clip_ratio/region_mean": 0.008064515888690948, "completion_length": 22.5, "completions/clipped_ratio": 0.0, "completions/max_length": 46.0, "completions/max_terminated_length": 46.0, "completions/mean_length": 22.5, "completions/mean_terminated_length": 22.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 83.83333333333333, "frac_reward_zero_std": 0.0, "grad_norm": 2.6504006385803223, "kl": 0.2289542555809021, "learning_rate": 3.0411111111111114e-06, "loss": 0.2509, "num_tokens": 1354273.0, "reward": 6.125, "reward_std": 3.75, "rewards/reward_combined/mean": 6.125, "rewards/reward_combined/std": 3.75, "step": 4527 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.5, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 2.5, "completions/mean_terminated_length": 2.5, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 83.85185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.03550137206912041, "kl": 0.0030156150460243225, "learning_rate": 3.040555555555556e-06, "loss": 0.0002, "num_tokens": 1354479.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 4528 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.0, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 24.0, "completions/mean_terminated_length": 24.0, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 83.87037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.016926180571317673, "kl": 0.08522696048021317, "learning_rate": 3.04e-06, "loss": 0.0043, "num_tokens": 1354843.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 4529 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.25, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 8.25, "completions/mean_terminated_length": 8.25, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 83.88888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.05669012293219566, "kl": 0.020377177745103836, "learning_rate": 3.039444444444445e-06, "loss": 0.0011, "num_tokens": 1355088.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 4530 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 83.9074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.05865732580423355, "kl": 0.014190131798386574, "learning_rate": 3.038888888888889e-06, "loss": 0.0007, "num_tokens": 1355360.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4531 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.75, "completions/clipped_ratio": 0.0, "completions/max_length": 29.0, "completions/max_terminated_length": 29.0, "completions/mean_length": 22.75, "completions/mean_terminated_length": 22.75, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 83.92592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.12122354656457901, "kl": 0.03808830492198467, "learning_rate": 3.0383333333333336e-06, "loss": 0.0019, "num_tokens": 1355699.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 4532 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 26.75, "completions/clipped_ratio": 0.0, "completions/max_length": 48.0, "completions/max_terminated_length": 48.0, "completions/mean_length": 26.75, "completions/mean_terminated_length": 26.75, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 83.94444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.128886878490448, "kl": 0.040256655775010586, "learning_rate": 3.037777777777778e-06, "loss": 0.0022, "num_tokens": 1356026.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4533 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.75, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 17.75, "completions/mean_terminated_length": 17.75, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 83.96296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.05269838869571686, "kl": 0.020153080578893423, "learning_rate": 3.0372222222222223e-06, "loss": 0.001, "num_tokens": 1356317.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4534 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 83.98148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.0009723145631141961, "kl": 8.696069198776968e-05, "learning_rate": 3.0366666666666667e-06, "loss": 0.0, "num_tokens": 1356589.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4535 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 84.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.025941014289855957, "kl": 0.012182238977402449, "learning_rate": 3.0361111111111115e-06, "loss": 0.0006, "num_tokens": 1356849.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4536 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 84.01851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.12301137298345566, "kl": 0.017986350925639272, "learning_rate": 3.0355555555555562e-06, "loss": 0.0009, "num_tokens": 1357133.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4537 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 35.25, "completions/clipped_ratio": 0.0, "completions/max_length": 39.0, "completions/max_terminated_length": 39.0, "completions/mean_length": 35.25, "completions/mean_terminated_length": 35.25, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 84.03703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.03287510201334953, "kl": 0.06783517636358738, "learning_rate": 3.035e-06, "loss": 0.0034, "num_tokens": 1357502.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4538 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 84.05555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.055518440902233124, "kl": 0.03696542605757713, "learning_rate": 3.034444444444445e-06, "loss": 0.0019, "num_tokens": 1357772.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4539 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.75, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 16.75, "completions/mean_terminated_length": 16.75, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 84.07407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.0342283733189106, "kl": 0.006662702886387706, "learning_rate": 3.033888888888889e-06, "loss": 0.0003, "num_tokens": 1358067.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4540 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 5.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 5.0, "completions/mean_terminated_length": 5.0, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 84.0925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.004318274091929197, "kl": 0.0011854649055749178, "learning_rate": 3.0333333333333337e-06, "loss": 0.0001, "num_tokens": 1358287.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 4541 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 27.0, "completions/clipped_ratio": 0.0, "completions/max_length": 30.0, "completions/max_terminated_length": 30.0, "completions/mean_length": 27.0, "completions/mean_terminated_length": 27.0, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 84.11111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.07586181163787842, "kl": 0.10635990649461746, "learning_rate": 3.032777777777778e-06, "loss": 0.0054, "num_tokens": 1358611.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4542 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 47.75, "completions/clipped_ratio": 0.0, "completions/max_length": 51.0, "completions/max_terminated_length": 51.0, "completions/mean_length": 47.75, "completions/mean_terminated_length": 47.75, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "epoch": 84.12962962962963, "frac_reward_zero_std": 0.0, "grad_norm": 1.1482784748077393, "kl": 0.038238413631916046, "learning_rate": 3.0322222222222224e-06, "loss": -0.0353, "num_tokens": 1359082.0, "reward": 2.375, "reward_std": 1.25, "rewards/reward_combined/mean": 2.375, "rewards/reward_combined/std": 1.25, "step": 4543 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 84.14814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.0009122500196099281, "kl": 0.0017772380961105227, "learning_rate": 3.0316666666666667e-06, "loss": 0.0001, "num_tokens": 1359362.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4544 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 84.16666666666667, "frac_reward_zero_std": 1.0, "grad_norm": 0.0001672843936830759, "kl": 3.5114586353302e-05, "learning_rate": 3.0311111111111115e-06, "loss": 0.0, "num_tokens": 1359582.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 4545 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.5, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.5, "completions/mean_terminated_length": 6.5, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 84.18518518518519, "frac_reward_zero_std": 1.0, "grad_norm": 0.04549086466431618, "kl": 0.0014933156198821962, "learning_rate": 3.030555555555556e-06, "loss": 0.0001, "num_tokens": 1359816.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 4546 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.5, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 24.5, "completions/mean_terminated_length": 24.5, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 84.20370370370371, "frac_reward_zero_std": 0.0, "grad_norm": 1.5272784233093262, "kl": 0.09925772994756699, "learning_rate": 3.0300000000000002e-06, "loss": 0.0251, "num_tokens": 1360182.0, "reward": 2.875, "reward_std": 1.25, "rewards/reward_combined/mean": 2.875, "rewards/reward_combined/std": 1.25, "step": 4547 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.5, "completions/clipped_ratio": 0.0, "completions/max_length": 31.0, "completions/max_terminated_length": 31.0, "completions/mean_length": 21.5, "completions/mean_terminated_length": 21.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 84.22222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.08957552164793015, "kl": 0.07840996980667114, "learning_rate": 3.029444444444445e-06, "loss": 0.0037, "num_tokens": 1360496.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4548 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 18.0, "completions/mean_terminated_length": 18.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 84.24074074074075, "frac_reward_zero_std": 0.0, "grad_norm": 2.94936203956604, "kl": 0.05882025836035609, "learning_rate": 3.028888888888889e-06, "loss": 0.0681, "num_tokens": 1360788.0, "reward": 6.0, "reward_std": 3.34165620803833, "rewards/reward_combined/mean": 6.0, "rewards/reward_combined/std": 3.34165620803833, "step": 4549 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.75, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 17.75, "completions/mean_terminated_length": 17.75, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 84.25925925925925, "frac_reward_zero_std": 0.0, "grad_norm": 4.677789211273193, "kl": 0.03848394379019737, "learning_rate": 3.0283333333333337e-06, "loss": -0.0811, "num_tokens": 1361107.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 4550 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.0, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 19.0, "completions/mean_terminated_length": 19.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 84.27777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.08023674041032791, "kl": 0.01591406762599945, "learning_rate": 3.0277777777777776e-06, "loss": 0.0008, "num_tokens": 1361427.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4551 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 84.29629629629629, "frac_reward_zero_std": 1.0, "grad_norm": 0.07670610398054123, "kl": 0.00787609035614878, "learning_rate": 3.0272222222222224e-06, "loss": 0.0004, "num_tokens": 1361717.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4552 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 84.31481481481481, "frac_reward_zero_std": 1.0, "grad_norm": 0.014241606928408146, "kl": 0.004335261881351471, "learning_rate": 3.0266666666666668e-06, "loss": 0.0002, "num_tokens": 1361977.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 4553 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.0, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 20.0, "completions/mean_terminated_length": 20.0, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 84.33333333333333, "frac_reward_zero_std": 1.0, "grad_norm": 0.042785871773958206, "kl": 0.02084551937878132, "learning_rate": 3.026111111111111e-06, "loss": 0.001, "num_tokens": 1362309.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4554 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 84.35185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.07846374064683914, "kl": 0.022185565903782845, "learning_rate": 3.025555555555556e-06, "loss": 0.0011, "num_tokens": 1362593.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4555 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.25, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 8.25, "completions/mean_terminated_length": 8.25, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 84.37037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.05045301467180252, "kl": 0.018532151356339455, "learning_rate": 3.0250000000000003e-06, "loss": 0.001, "num_tokens": 1362838.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 4556 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 84.38888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.12033317983150482, "kl": 0.028477382846176624, "learning_rate": 3.024444444444445e-06, "loss": 0.0014, "num_tokens": 1363102.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4557 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 84.4074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.01563281938433647, "kl": 0.007985584437847137, "learning_rate": 3.023888888888889e-06, "loss": 0.0004, "num_tokens": 1363338.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 4558 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 9.5, "completions/mean_terminated_length": 9.5, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 84.42592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.055868685245513916, "kl": 0.0057627959176898, "learning_rate": 3.0233333333333338e-06, "loss": 0.0003, "num_tokens": 1363596.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4559 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 84.44444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.02370368503034115, "kl": 0.0017425521509721875, "learning_rate": 3.0227777777777777e-06, "loss": 0.0001, "num_tokens": 1363884.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4560 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 84.46296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.4310804009437561, "kl": 0.06341608613729477, "learning_rate": 3.0222222222222225e-06, "loss": 0.0032, "num_tokens": 1364100.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 4561 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.25, "completions/clipped_ratio": 0.0, "completions/max_length": 30.0, "completions/max_terminated_length": 30.0, "completions/mean_length": 22.25, "completions/mean_terminated_length": 22.25, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 84.48148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.22603951394557953, "kl": 0.11948023736476898, "learning_rate": 3.021666666666667e-06, "loss": 0.0058, "num_tokens": 1364409.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 4562 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.0, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 20.0, "completions/mean_terminated_length": 20.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 84.5, "frac_reward_zero_std": 1.0, "grad_norm": 0.27396467328071594, "kl": 0.05147061962634325, "learning_rate": 3.021111111111111e-06, "loss": 0.0025, "num_tokens": 1364709.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4563 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.75, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 17.75, "completions/mean_terminated_length": 17.75, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 84.51851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.05496816337108612, "kl": 0.13952144235372543, "learning_rate": 3.020555555555556e-06, "loss": 0.0069, "num_tokens": 1365020.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4564 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 84.53703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.025699526071548462, "kl": 0.012356439605355263, "learning_rate": 3.0200000000000003e-06, "loss": 0.0006, "num_tokens": 1365280.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4565 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 84.55555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.003335857531055808, "kl": 0.017119173891842365, "learning_rate": 3.0194444444444447e-06, "loss": 0.0009, "num_tokens": 1365592.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 4566 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 29.25, "completions/clipped_ratio": 0.0, "completions/max_length": 60.0, "completions/max_terminated_length": 60.0, "completions/mean_length": 29.25, "completions/mean_terminated_length": 29.25, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 84.57407407407408, "frac_reward_zero_std": 0.0, "grad_norm": 3.918219566345215, "kl": 0.07538479566574097, "learning_rate": 3.018888888888889e-06, "loss": 0.2645, "num_tokens": 1365961.0, "reward": 6.5, "reward_std": 2.0, "rewards/reward_combined/mean": 6.5, "rewards/reward_combined/std": 2.0, "step": 4567 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 84.5925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.02974037453532219, "kl": 0.2421758770942688, "learning_rate": 3.018333333333334e-06, "loss": 0.012, "num_tokens": 1366261.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4568 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 18.0, "completions/mean_terminated_length": 18.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 84.61111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.0070106410421431065, "kl": 0.002916488330811262, "learning_rate": 3.0177777777777777e-06, "loss": 0.0001, "num_tokens": 1366573.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4569 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.25, "completions/clipped_ratio": 0.0, "completions/max_length": 29.0, "completions/max_terminated_length": 29.0, "completions/mean_length": 23.25, "completions/mean_terminated_length": 23.25, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 84.62962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.11421194672584534, "kl": 0.023529349826276302, "learning_rate": 3.0172222222222225e-06, "loss": 0.0013, "num_tokens": 1366914.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 4570 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 26.75, "completions/clipped_ratio": 0.0, "completions/max_length": 33.0, "completions/max_terminated_length": 33.0, "completions/mean_length": 26.75, "completions/mean_terminated_length": 26.75, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 84.64814814814815, "frac_reward_zero_std": 0.0, "grad_norm": 2.2694904804229736, "kl": 0.3455301374197006, "learning_rate": 3.0166666666666673e-06, "loss": -0.1328, "num_tokens": 1367245.0, "reward": 7.875, "reward_std": 0.25, "rewards/reward_combined/mean": 7.875, "rewards/reward_combined/std": 0.25, "step": 4571 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 28.75, "completions/clipped_ratio": 0.0, "completions/max_length": 32.0, "completions/max_terminated_length": 32.0, "completions/mean_length": 28.75, "completions/mean_terminated_length": 28.75, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 84.66666666666667, "frac_reward_zero_std": 1.0, "grad_norm": 0.1349426656961441, "kl": 0.08262016624212265, "learning_rate": 3.0161111111111112e-06, "loss": 0.0043, "num_tokens": 1367584.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4572 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 84.68518518518519, "frac_reward_zero_std": 1.0, "grad_norm": 0.007260896731168032, "kl": 0.001979529857635498, "learning_rate": 3.015555555555556e-06, "loss": 0.0001, "num_tokens": 1367796.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 4573 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.25, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.25, "completions/mean_terminated_length": 11.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 84.70370370370371, "frac_reward_zero_std": 0.0, "grad_norm": 4.668918132781982, "kl": 0.011141299037262797, "learning_rate": 3.0150000000000004e-06, "loss": 0.0209, "num_tokens": 1368093.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 4574 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 84.72222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.02478647790849209, "kl": 0.03322112001478672, "learning_rate": 3.0144444444444447e-06, "loss": 0.0017, "num_tokens": 1368385.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4575 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.25, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 4.25, "completions/mean_terminated_length": 4.25, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 84.74074074074075, "frac_reward_zero_std": 1.0, "grad_norm": 0.061557233333587646, "kl": 0.004145428538322449, "learning_rate": 3.013888888888889e-06, "loss": 0.0002, "num_tokens": 1368598.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 4576 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 84.75925925925925, "frac_reward_zero_std": 1.0, "grad_norm": 0.045669328421354294, "kl": 0.0040769800543785095, "learning_rate": 3.013333333333334e-06, "loss": 0.0002, "num_tokens": 1368866.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4577 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 84.77777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.11252601444721222, "kl": 0.06566472724080086, "learning_rate": 3.0127777777777778e-06, "loss": 0.0033, "num_tokens": 1369138.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4578 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 84.79629629629629, "frac_reward_zero_std": 1.0, "grad_norm": 0.11126439273357391, "kl": 0.00982298655435443, "learning_rate": 3.0122222222222226e-06, "loss": 0.0005, "num_tokens": 1369406.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4579 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 84.81481481481481, "frac_reward_zero_std": 1.0, "grad_norm": 0.007773899473249912, "kl": 0.0001852894783951342, "learning_rate": 3.0116666666666673e-06, "loss": 0.0, "num_tokens": 1369676.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4580 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 84.83333333333333, "frac_reward_zero_std": 1.0, "grad_norm": 0.038187041878700256, "kl": 0.0012098372899345122, "learning_rate": 3.0111111111111113e-06, "loss": 0.0001, "num_tokens": 1369932.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 4581 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.5, "completions/clipped_ratio": 0.0, "completions/max_length": 33.0, "completions/max_terminated_length": 33.0, "completions/mean_length": 20.5, "completions/mean_terminated_length": 20.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 84.85185185185185, "frac_reward_zero_std": 0.0, "grad_norm": 3.2551963329315186, "kl": 0.05812861584126949, "learning_rate": 3.010555555555556e-06, "loss": -0.1595, "num_tokens": 1370234.0, "reward": 6.5, "reward_std": 2.0, "rewards/reward_combined/mean": 6.5, "rewards/reward_combined/std": 2.0, "step": 4582 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 16.5, "completions/mean_terminated_length": 16.5, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 84.87037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.521705687046051, "kl": 0.056664541363716125, "learning_rate": 3.01e-06, "loss": 0.0029, "num_tokens": 1370540.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4583 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.75, "completions/clipped_ratio": 0.0, "completions/max_length": 32.0, "completions/max_terminated_length": 32.0, "completions/mean_length": 25.75, "completions/mean_terminated_length": 25.75, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 84.88888888888889, "frac_reward_zero_std": 0.0, "grad_norm": 1.2179241180419922, "kl": 0.09035391919314861, "learning_rate": 3.0094444444444448e-06, "loss": -0.0147, "num_tokens": 1370879.0, "reward": 4.375, "reward_std": 2.0966243743896484, "rewards/reward_combined/mean": 4.375, "rewards/reward_combined/std": 2.0966243743896484, "step": 4584 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.25, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.25, "completions/mean_terminated_length": 15.25, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 84.9074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.0019337196135893464, "kl": 0.0011823612148873508, "learning_rate": 3.008888888888889e-06, "loss": 0.0001, "num_tokens": 1371200.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4585 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 84.92592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.01036893855780363, "kl": 0.0004677453252952546, "learning_rate": 3.0083333333333335e-06, "loss": 0.0, "num_tokens": 1371480.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4586 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 48.0, "completions/clipped_ratio": 0.0, "completions/max_length": 48.0, "completions/max_terminated_length": 48.0, "completions/mean_length": 48.0, "completions/mean_terminated_length": 48.0, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "epoch": 84.94444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.022539842873811722, "kl": 0.02156743034720421, "learning_rate": 3.007777777777778e-06, "loss": 0.0011, "num_tokens": 1371892.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4587 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.5, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 8.5, "completions/mean_terminated_length": 8.5, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 84.96296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.04974540323019028, "kl": 0.004663419735152274, "learning_rate": 3.0072222222222226e-06, "loss": 0.0002, "num_tokens": 1372146.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 4588 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.5, "completions/clipped_ratio": 0.0, "completions/max_length": 28.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 23.5, "completions/mean_terminated_length": 23.5, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 84.98148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.20773929357528687, "kl": 0.04990058019757271, "learning_rate": 3.0066666666666674e-06, "loss": 0.0027, "num_tokens": 1372468.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4589 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 85.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.016701817512512207, "kl": 0.0013169050216674805, "learning_rate": 3.0061111111111113e-06, "loss": 0.0001, "num_tokens": 1372680.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 4590 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.0, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 21.0, "completions/mean_terminated_length": 21.0, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 85.01851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.038933683186769485, "kl": 0.013114158529788256, "learning_rate": 3.005555555555556e-06, "loss": 0.0006, "num_tokens": 1373016.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4591 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.75, "completions/clipped_ratio": 0.0, "completions/max_length": 48.0, "completions/max_terminated_length": 48.0, "completions/mean_length": 20.75, "completions/mean_terminated_length": 20.75, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 85.03703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.31591278314590454, "kl": 0.08367208205163479, "learning_rate": 3.005e-06, "loss": 0.0053, "num_tokens": 1373319.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4592 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 85.05555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.028873898088932037, "kl": 0.011638366151601076, "learning_rate": 3.004444444444445e-06, "loss": 0.0006, "num_tokens": 1373579.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4593 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 85.07407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.01782539300620556, "kl": 0.0016463398933410645, "learning_rate": 3.003888888888889e-06, "loss": 0.0001, "num_tokens": 1373791.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 4594 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.5, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 18.5, "completions/mean_terminated_length": 18.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 85.0925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.04772656410932541, "kl": 0.016796916257590055, "learning_rate": 3.0033333333333335e-06, "loss": 0.0008, "num_tokens": 1374085.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4595 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 85.11111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.013737705536186695, "kl": 0.2541238144040108, "learning_rate": 3.002777777777778e-06, "loss": 0.0127, "num_tokens": 1374383.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4596 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 85.12962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.01620575226843357, "kl": 0.007732383906841278, "learning_rate": 3.0022222222222227e-06, "loss": 0.0004, "num_tokens": 1374619.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 4597 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.5, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 22.5, "completions/mean_terminated_length": 22.5, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 85.14814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.07039337605237961, "kl": 0.017725318670272827, "learning_rate": 3.001666666666667e-06, "loss": 0.0009, "num_tokens": 1374937.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4598 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.75, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.75, "completions/mean_terminated_length": 6.75, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 85.16666666666667, "frac_reward_zero_std": 1.0, "grad_norm": 0.022466151043772697, "kl": 0.0007188206654973328, "learning_rate": 3.0011111111111114e-06, "loss": 0.0, "num_tokens": 1375172.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 4599 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 46.5, "completions/clipped_ratio": 0.0, "completions/max_length": 51.0, "completions/max_terminated_length": 51.0, "completions/mean_length": 46.5, "completions/mean_terminated_length": 46.5, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "epoch": 85.18518518518519, "frac_reward_zero_std": 1.0, "grad_norm": 0.009564748033881187, "kl": 0.04760884679853916, "learning_rate": 3.000555555555556e-06, "loss": 0.0024, "num_tokens": 1375638.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 4600 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 28.25, "completions/clipped_ratio": 0.0, "completions/max_length": 29.0, "completions/max_terminated_length": 29.0, "completions/mean_length": 28.25, "completions/mean_terminated_length": 28.25, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 85.20370370370371, "frac_reward_zero_std": 1.0, "grad_norm": 0.27863237261772156, "kl": 0.19008278846740723, "learning_rate": 3e-06, "loss": 0.0095, "num_tokens": 1375975.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4601 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 85.22222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.06715232878923416, "kl": 0.008003470953553915, "learning_rate": 2.999444444444445e-06, "loss": 0.0004, "num_tokens": 1376277.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4602 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 85.24074074074075, "frac_reward_zero_std": 1.0, "grad_norm": 0.19314059615135193, "kl": 0.05127967707812786, "learning_rate": 2.9988888888888888e-06, "loss": 0.0026, "num_tokens": 1376541.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4603 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 85.25925925925925, "frac_reward_zero_std": 1.0, "grad_norm": 0.0006725996499881148, "kl": 0.0017560379346832633, "learning_rate": 2.9983333333333336e-06, "loss": 0.0001, "num_tokens": 1376821.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4604 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 85.27777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.03814643621444702, "kl": 0.010511867003515363, "learning_rate": 2.997777777777778e-06, "loss": 0.0005, "num_tokens": 1377101.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4605 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.25, "completions/clipped_ratio": 0.0, "completions/max_length": 31.0, "completions/max_terminated_length": 31.0, "completions/mean_length": 22.25, "completions/mean_terminated_length": 22.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 85.29629629629629, "frac_reward_zero_std": 0.0, "grad_norm": 3.661592721939087, "kl": 0.06253781728446484, "learning_rate": 2.9972222222222223e-06, "loss": 0.0796, "num_tokens": 1377410.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 4606 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 7.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 7.0, "completions/mean_terminated_length": 7.0, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 85.31481481481481, "frac_reward_zero_std": 0.0, "grad_norm": 6.706676959991455, "kl": 0.034616378718055785, "learning_rate": 2.996666666666667e-06, "loss": 0.2254, "num_tokens": 1377638.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 4607 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 26.75, "completions/clipped_ratio": 0.0, "completions/max_length": 29.0, "completions/max_terminated_length": 29.0, "completions/mean_length": 26.75, "completions/mean_terminated_length": 26.75, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 85.33333333333333, "frac_reward_zero_std": 1.0, "grad_norm": 0.04147358611226082, "kl": 0.10364893078804016, "learning_rate": 2.9961111111111114e-06, "loss": 0.0053, "num_tokens": 1377961.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4608 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 85.35185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.013744580559432507, "kl": 0.0027629851829260588, "learning_rate": 2.995555555555556e-06, "loss": 0.0001, "num_tokens": 1378217.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4609 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.0, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 20.0, "completions/mean_terminated_length": 20.0, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 85.37037037037037, "frac_reward_zero_std": 0.0, "grad_norm": 4.6973876953125, "kl": 0.1060846820473671, "learning_rate": 2.995e-06, "loss": 0.0341, "num_tokens": 1378525.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 4610 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 85.38888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.012705925852060318, "kl": 0.0013490871351677924, "learning_rate": 2.994444444444445e-06, "loss": 0.0001, "num_tokens": 1378795.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4611 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.25, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 20.25, "completions/mean_terminated_length": 20.25, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 85.4074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.05200381577014923, "kl": 0.015233179554343224, "learning_rate": 2.993888888888889e-06, "loss": 0.0008, "num_tokens": 1379120.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4612 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 85.42592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.33216407895088196, "kl": 0.04542376101016998, "learning_rate": 2.9933333333333336e-06, "loss": 0.0026, "num_tokens": 1379384.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 4613 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 40.75, "completions/clipped_ratio": 0.0, "completions/max_length": 48.0, "completions/max_terminated_length": 48.0, "completions/mean_length": 40.75, "completions/mean_terminated_length": 40.75, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 85.44444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.11330398917198181, "kl": 0.040161858312785625, "learning_rate": 2.992777777777778e-06, "loss": 0.0017, "num_tokens": 1379767.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4614 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 85.46296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.04747876524925232, "kl": 0.0235928101465106, "learning_rate": 2.9922222222222223e-06, "loss": 0.0012, "num_tokens": 1380057.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4615 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 33.5, "completions/clipped_ratio": 0.0, "completions/max_length": 34.0, "completions/max_terminated_length": 34.0, "completions/mean_length": 33.5, "completions/mean_terminated_length": 33.5, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 85.48148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.032450977712869644, "kl": 0.07259627431631088, "learning_rate": 2.991666666666667e-06, "loss": 0.0036, "num_tokens": 1380419.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4616 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.75, "completions/clipped_ratio": 0.0, "completions/max_length": 41.0, "completions/max_terminated_length": 41.0, "completions/mean_length": 32.75, "completions/mean_terminated_length": 32.75, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 85.5, "frac_reward_zero_std": 1.0, "grad_norm": 0.12990929186344147, "kl": 0.04800053499639034, "learning_rate": 2.9911111111111115e-06, "loss": 0.0025, "num_tokens": 1380774.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4617 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 85.51851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.0026029839646071196, "kl": 0.003398702945560217, "learning_rate": 2.990555555555556e-06, "loss": 0.0002, "num_tokens": 1381070.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4618 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.25, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 18.25, "completions/mean_terminated_length": 18.25, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 85.53703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.21966500580310822, "kl": 0.03337015677243471, "learning_rate": 2.99e-06, "loss": 0.0016, "num_tokens": 1381383.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4619 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 85.55555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.007108363322913647, "kl": 0.0028311953647062182, "learning_rate": 2.989444444444445e-06, "loss": 0.0001, "num_tokens": 1381667.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4620 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.5, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 3.5, "completions/mean_terminated_length": 3.5, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 85.57407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.05242834612727165, "kl": 0.0025136545300483704, "learning_rate": 2.988888888888889e-06, "loss": 0.0001, "num_tokens": 1381877.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 4621 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.25, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 8.25, "completions/mean_terminated_length": 8.25, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 85.5925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.06212785467505455, "kl": 0.027070730924606323, "learning_rate": 2.9883333333333337e-06, "loss": 0.0014, "num_tokens": 1382122.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 4622 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 85.61111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.6086010932922363, "kl": 0.11424067243933678, "learning_rate": 2.9877777777777776e-06, "loss": 0.0057, "num_tokens": 1382434.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 4623 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 85.62962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.10005002468824387, "kl": 0.049685386940836906, "learning_rate": 2.9872222222222224e-06, "loss": 0.0026, "num_tokens": 1382706.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4624 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 85.64814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.02054181508719921, "kl": 0.0010020470508607104, "learning_rate": 2.986666666666667e-06, "loss": 0.0, "num_tokens": 1382984.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4625 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 85.66666666666667, "frac_reward_zero_std": 1.0, "grad_norm": 0.028614580631256104, "kl": 0.010389144532382488, "learning_rate": 2.986111111111111e-06, "loss": 0.0006, "num_tokens": 1383272.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4626 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.5, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 17.5, "completions/mean_terminated_length": 17.5, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 85.68518518518519, "frac_reward_zero_std": 1.0, "grad_norm": 0.07925610989332199, "kl": 0.019715266302227974, "learning_rate": 2.985555555555556e-06, "loss": 0.001, "num_tokens": 1383590.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4627 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.25, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 4.25, "completions/mean_terminated_length": 4.25, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 85.70370370370371, "frac_reward_zero_std": 1.0, "grad_norm": 0.4721038043498993, "kl": 0.05469772219657898, "learning_rate": 2.9850000000000002e-06, "loss": 0.003, "num_tokens": 1383803.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 4628 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 85.72222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.02353290095925331, "kl": 0.004566412419080734, "learning_rate": 2.984444444444445e-06, "loss": 0.0002, "num_tokens": 1384063.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 4629 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 85.74074074074075, "frac_reward_zero_std": 1.0, "grad_norm": 0.012398525141179562, "kl": 0.00020294786372687668, "learning_rate": 2.983888888888889e-06, "loss": 0.0, "num_tokens": 1384319.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 4630 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.25, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.25, "completions/mean_terminated_length": 11.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 85.75925925925925, "frac_reward_zero_std": 1.0, "grad_norm": 0.6255937814712524, "kl": 0.12887411564588547, "learning_rate": 2.9833333333333337e-06, "loss": 0.0067, "num_tokens": 1384592.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4631 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.5, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 18.5, "completions/mean_terminated_length": 18.5, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 85.77777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.049242787063121796, "kl": 0.006651653908193111, "learning_rate": 2.9827777777777776e-06, "loss": 0.0003, "num_tokens": 1384894.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4632 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 85.79629629629629, "frac_reward_zero_std": 1.0, "grad_norm": 0.08459031581878662, "kl": 0.007837135810405016, "learning_rate": 2.9822222222222224e-06, "loss": 0.0004, "num_tokens": 1385158.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4633 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 85.81481481481481, "frac_reward_zero_std": 1.0, "grad_norm": 0.003950245212763548, "kl": 0.001286353392060846, "learning_rate": 2.981666666666667e-06, "loss": 0.0001, "num_tokens": 1385474.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4634 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 16.5, "completions/mean_terminated_length": 16.5, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 85.83333333333333, "frac_reward_zero_std": 0.0, "grad_norm": 3.7973153591156006, "kl": 0.16696340590715408, "learning_rate": 2.981111111111111e-06, "loss": -0.0226, "num_tokens": 1385780.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 4635 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 39.25, "completions/clipped_ratio": 0.0, "completions/max_length": 96.0, "completions/max_terminated_length": 96.0, "completions/mean_length": 39.25, "completions/mean_terminated_length": 39.25, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 85.85185185185185, "frac_reward_zero_std": 0.0, "grad_norm": 2.3298001289367676, "kl": 0.06880737468600273, "learning_rate": 2.980555555555556e-06, "loss": 0.3085, "num_tokens": 1386157.0, "reward": 5.875, "reward_std": 3.5910768508911133, "rewards/reward_combined/mean": 5.875, "rewards/reward_combined/std": 3.5910770893096924, "step": 4636 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.0, "completions/clipped_ratio": 0.0, "completions/max_length": 28.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 25.0, "completions/mean_terminated_length": 25.0, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 85.87037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.08919786661863327, "kl": 0.10946996882557869, "learning_rate": 2.9800000000000003e-06, "loss": 0.0055, "num_tokens": 1386493.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4637 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 85.88888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.07147003710269928, "kl": 0.03582633286714554, "learning_rate": 2.9794444444444446e-06, "loss": 0.0018, "num_tokens": 1386709.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 4638 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.0, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 24.0, "completions/mean_terminated_length": 24.0, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 85.9074074074074, "frac_reward_zero_std": 0.0, "grad_norm": 0.44941261410713196, "kl": 0.23167582601308823, "learning_rate": 2.978888888888889e-06, "loss": 0.0117, "num_tokens": 1387073.0, "reward": 2.875, "reward_std": 1.25, "rewards/reward_combined/mean": 2.875, "rewards/reward_combined/std": 1.25, "step": 4639 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.0, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 20.0, "completions/mean_terminated_length": 20.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 85.92592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.24988330900669098, "kl": 0.03183523565530777, "learning_rate": 2.9783333333333338e-06, "loss": 0.0016, "num_tokens": 1387401.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 4640 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 85.94444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.0001567662402521819, "kl": 3.9555132389068604e-05, "learning_rate": 2.9777777777777777e-06, "loss": 0.0, "num_tokens": 1387621.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 4641 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 85.96296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.022786375135183334, "kl": 0.006366457149852067, "learning_rate": 2.9772222222222225e-06, "loss": 0.0003, "num_tokens": 1387909.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4642 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.25, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.25, "completions/mean_terminated_length": 13.25, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 85.98148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.11447670310735703, "kl": 0.011651043547317386, "learning_rate": 2.9766666666666672e-06, "loss": 0.0006, "num_tokens": 1388174.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4643 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.0, "completions/clipped_ratio": 0.0, "completions/max_length": 30.0, "completions/max_terminated_length": 30.0, "completions/mean_length": 24.0, "completions/mean_terminated_length": 24.0, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 86.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.28882259130477905, "kl": 0.05915684625506401, "learning_rate": 2.976111111111111e-06, "loss": 0.0033, "num_tokens": 1388522.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4644 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 86.01851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.17076463997364044, "kl": 0.055276280269026756, "learning_rate": 2.975555555555556e-06, "loss": 0.0031, "num_tokens": 1388802.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4645 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 86.03703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.00882179755717516, "kl": 0.0011693552369251847, "learning_rate": 2.9750000000000003e-06, "loss": 0.0001, "num_tokens": 1389124.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4646 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 86.05555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.19954288005828857, "kl": 0.049115548899862915, "learning_rate": 2.9744444444444447e-06, "loss": 0.0021, "num_tokens": 1389428.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4647 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.25, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 13.25, "completions/mean_terminated_length": 13.25, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 86.07407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.062290024012327194, "kl": 0.005740372929722071, "learning_rate": 2.973888888888889e-06, "loss": 0.0003, "num_tokens": 1389693.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4648 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 40.5, "completions/clipped_ratio": 0.0, "completions/max_length": 42.0, "completions/max_terminated_length": 42.0, "completions/mean_length": 40.5, "completions/mean_terminated_length": 40.5, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 86.0925925925926, "frac_reward_zero_std": 0.0, "grad_norm": 1.520658016204834, "kl": 0.07425734028220177, "learning_rate": 2.973333333333334e-06, "loss": 0.0077, "num_tokens": 1390079.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 4649 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 86.11111111111111, "frac_reward_zero_std": 0.0, "grad_norm": 2.7854065895080566, "kl": 0.05186654254794121, "learning_rate": 2.9727777777777777e-06, "loss": 0.0034, "num_tokens": 1390363.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 4650 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.75, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 20.75, "completions/mean_terminated_length": 20.75, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 86.12962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.03826197609305382, "kl": 0.011535762343555689, "learning_rate": 2.9722222222222225e-06, "loss": 0.0006, "num_tokens": 1390690.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4651 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 86.14814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.011846239678561687, "kl": 0.003221810795366764, "learning_rate": 2.9716666666666673e-06, "loss": 0.0001, "num_tokens": 1390958.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4652 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.5, "completions/clipped_ratio": 0.0, "completions/max_length": 31.0, "completions/max_terminated_length": 31.0, "completions/mean_length": 24.5, "completions/mean_terminated_length": 24.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 86.16666666666667, "frac_reward_zero_std": 0.0, "grad_norm": 3.0649430751800537, "kl": 0.06479782052338123, "learning_rate": 2.9711111111111112e-06, "loss": 0.1025, "num_tokens": 1391284.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 4653 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 86.18518518518519, "frac_reward_zero_std": 1.0, "grad_norm": 0.011892658658325672, "kl": 0.03289969265460968, "learning_rate": 2.970555555555556e-06, "loss": 0.0016, "num_tokens": 1391500.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 4654 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.25, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 18.25, "completions/mean_terminated_length": 18.25, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 86.20370370370371, "frac_reward_zero_std": 1.0, "grad_norm": 0.11775867640972137, "kl": 0.02064973535016179, "learning_rate": 2.97e-06, "loss": 0.001, "num_tokens": 1391801.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4655 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.25, "completions/clipped_ratio": 0.0, "completions/max_length": 34.0, "completions/max_terminated_length": 34.0, "completions/mean_length": 32.25, "completions/mean_terminated_length": 32.25, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 86.22222222222223, "frac_reward_zero_std": 0.0, "grad_norm": 1.2370116710662842, "kl": 0.15087277814745903, "learning_rate": 2.9694444444444447e-06, "loss": -0.0246, "num_tokens": 1392158.0, "reward": 7.875, "reward_std": 0.25, "rewards/reward_combined/mean": 7.875, "rewards/reward_combined/std": 0.25, "step": 4656 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 86.24074074074075, "frac_reward_zero_std": 0.0, "grad_norm": 3.5314090251922607, "kl": 0.14071294479072094, "learning_rate": 2.968888888888889e-06, "loss": 0.0099, "num_tokens": 1392442.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 4657 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 86.25925925925925, "frac_reward_zero_std": 1.0, "grad_norm": 0.004447453189641237, "kl": 0.00404728576540947, "learning_rate": 2.9683333333333334e-06, "loss": 0.0002, "num_tokens": 1392702.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 4658 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.25, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 19.25, "completions/mean_terminated_length": 19.25, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 86.27777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.7994934320449829, "kl": 0.18349428288638592, "learning_rate": 2.9677777777777778e-06, "loss": 0.0092, "num_tokens": 1392999.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4659 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 86.29629629629629, "frac_reward_zero_std": 0.0, "grad_norm": 5.165142059326172, "kl": 0.0012097656726837158, "learning_rate": 2.9672222222222226e-06, "loss": 0.0033, "num_tokens": 1393243.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 4660 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.25, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 19.25, "completions/mean_terminated_length": 19.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 86.31481481481481, "frac_reward_zero_std": 1.0, "grad_norm": 0.02222989872097969, "kl": 0.14950066804885864, "learning_rate": 2.9666666666666673e-06, "loss": 0.0073, "num_tokens": 1393560.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4661 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 86.33333333333333, "frac_reward_zero_std": 1.0, "grad_norm": 0.00015662022633478045, "kl": 3.901869058609009e-05, "learning_rate": 2.9661111111111113e-06, "loss": 0.0, "num_tokens": 1393780.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 4662 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.5, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 20.5, "completions/mean_terminated_length": 20.5, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 86.35185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.03344927355647087, "kl": 0.013890162110328674, "learning_rate": 2.965555555555556e-06, "loss": 0.0007, "num_tokens": 1394114.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4663 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.5, "completions/clipped_ratio": 0.0, "completions/max_length": 29.0, "completions/max_terminated_length": 29.0, "completions/mean_length": 24.5, "completions/mean_terminated_length": 24.5, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 86.37037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.3009037971496582, "kl": 0.17956674098968506, "learning_rate": 2.965e-06, "loss": 0.009, "num_tokens": 1394428.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4664 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.5, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.5, "completions/mean_terminated_length": 6.5, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 86.38888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.028736766427755356, "kl": 0.0018278445350006223, "learning_rate": 2.9644444444444448e-06, "loss": 0.0001, "num_tokens": 1394662.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 4665 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 86.4074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.022831227630376816, "kl": 0.0009261046652682126, "learning_rate": 2.963888888888889e-06, "loss": 0.0, "num_tokens": 1394932.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4666 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.25, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 17.25, "completions/mean_terminated_length": 17.25, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 86.42592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.09896409511566162, "kl": 0.011930698732612655, "learning_rate": 2.9633333333333335e-06, "loss": 0.0005, "num_tokens": 1395241.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4667 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.25, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.25, "completions/mean_terminated_length": 14.25, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 86.44444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.08063303679227829, "kl": 0.020269401371479034, "learning_rate": 2.962777777777778e-06, "loss": 0.001, "num_tokens": 1395514.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4668 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 86.46296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.024338051676750183, "kl": 0.006731610279530287, "learning_rate": 2.9622222222222226e-06, "loss": 0.0003, "num_tokens": 1395802.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4669 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 86.48148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.050773218274116516, "kl": 0.008405090775340796, "learning_rate": 2.961666666666667e-06, "loss": 0.0004, "num_tokens": 1396082.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4670 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 29.25, "completions/clipped_ratio": 0.0, "completions/max_length": 33.0, "completions/max_terminated_length": 33.0, "completions/mean_length": 29.25, "completions/mean_terminated_length": 29.25, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 86.5, "frac_reward_zero_std": 1.0, "grad_norm": 0.13919511437416077, "kl": 0.1199696809053421, "learning_rate": 2.9611111111111113e-06, "loss": 0.0061, "num_tokens": 1396435.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4671 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 48.25, "completions/clipped_ratio": 0.0, "completions/max_length": 49.0, "completions/max_terminated_length": 49.0, "completions/mean_length": 48.25, "completions/mean_terminated_length": 48.25, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "epoch": 86.51851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.07626207917928696, "kl": 0.03480422869324684, "learning_rate": 2.960555555555556e-06, "loss": 0.0017, "num_tokens": 1396848.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4672 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 86.53703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.012155018746852875, "kl": 0.014470145106315613, "learning_rate": 2.96e-06, "loss": 0.0007, "num_tokens": 1397160.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 4673 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.5, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 9.5, "completions/mean_terminated_length": 9.5, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 86.55555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.13865868747234344, "kl": 0.029581869021058083, "learning_rate": 2.959444444444445e-06, "loss": 0.0015, "num_tokens": 1397422.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4674 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.0, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 20.0, "completions/mean_terminated_length": 20.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 86.57407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.11135158687829971, "kl": 0.023223457857966423, "learning_rate": 2.9588888888888887e-06, "loss": 0.0012, "num_tokens": 1397750.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 4675 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 86.5925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.060112114995718, "kl": 0.012149238216807134, "learning_rate": 2.9583333333333335e-06, "loss": 0.0006, "num_tokens": 1398006.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 4676 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.0, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 22.0, "completions/mean_terminated_length": 22.0, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 86.61111111111111, "frac_reward_zero_std": 0.0, "grad_norm": 1.6456005573272705, "kl": 0.011419904418289661, "learning_rate": 2.957777777777778e-06, "loss": -0.0008, "num_tokens": 1398322.0, "reward": 5.75, "reward_std": 2.598076105117798, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 2.598076105117798, "step": 4677 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 5.5, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 5.5, "completions/mean_terminated_length": 5.5, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 86.62962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.14982439577579498, "kl": 0.010286005970556289, "learning_rate": 2.9572222222222222e-06, "loss": 0.0006, "num_tokens": 1398544.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 4678 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 86.64814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.04200293868780136, "kl": 0.02497793221846223, "learning_rate": 2.956666666666667e-06, "loss": 0.0012, "num_tokens": 1398832.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4679 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 86.66666666666667, "frac_reward_zero_std": 1.0, "grad_norm": 0.12991224229335785, "kl": 0.04377650562673807, "learning_rate": 2.9561111111111114e-06, "loss": 0.0024, "num_tokens": 1399104.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4680 }, { "clip_ratio/high_max": 0.006756756920367479, "clip_ratio/high_mean": 0.006756756920367479, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.006756756920367479, "completion_length": 32.0, "completions/clipped_ratio": 0.0, "completions/max_length": 55.0, "completions/max_terminated_length": 55.0, "completions/mean_length": 32.0, "completions/mean_terminated_length": 32.0, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 86.68518518518519, "frac_reward_zero_std": 0.0, "grad_norm": 4.065244674682617, "kl": 0.07968329824507236, "learning_rate": 2.955555555555556e-06, "loss": 0.1229, "num_tokens": 1399484.0, "reward": 6.5, "reward_std": 2.345207929611206, "rewards/reward_combined/mean": 6.5, "rewards/reward_combined/std": 2.345207929611206, "step": 4681 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 28.0, "completions/clipped_ratio": 0.0, "completions/max_length": 32.0, "completions/max_terminated_length": 32.0, "completions/mean_length": 28.0, "completions/mean_terminated_length": 28.0, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 86.70370370370371, "frac_reward_zero_std": 0.0, "grad_norm": 3.4317030906677246, "kl": 0.11838197335600853, "learning_rate": 2.955e-06, "loss": 0.0883, "num_tokens": 1399816.0, "reward": 5.125, "reward_std": 3.4731109142303467, "rewards/reward_combined/mean": 5.125, "rewards/reward_combined/std": 3.4731109142303467, "step": 4682 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 86.72222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.05378296226263046, "kl": 0.0033472084905952215, "learning_rate": 2.954444444444445e-06, "loss": 0.0002, "num_tokens": 1400088.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4683 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 86.74074074074075, "frac_reward_zero_std": 1.0, "grad_norm": 0.006415011361241341, "kl": 0.0005563646554946899, "learning_rate": 2.953888888888889e-06, "loss": 0.0, "num_tokens": 1400300.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 4684 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 86.75925925925925, "frac_reward_zero_std": 1.0, "grad_norm": 0.04083802551031113, "kl": 0.005748944357037544, "learning_rate": 2.9533333333333336e-06, "loss": 0.0003, "num_tokens": 1400600.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4685 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 86.77777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.006633398123085499, "kl": 0.002730660722590983, "learning_rate": 2.952777777777778e-06, "loss": 0.0001, "num_tokens": 1400884.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4686 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.0, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 24.0, "completions/mean_terminated_length": 24.0, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 86.79629629629629, "frac_reward_zero_std": 1.0, "grad_norm": 0.002700802870094776, "kl": 0.09021953120827675, "learning_rate": 2.9522222222222223e-06, "loss": 0.0045, "num_tokens": 1401248.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 4687 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 33.5, "completions/clipped_ratio": 0.0, "completions/max_length": 41.0, "completions/max_terminated_length": 41.0, "completions/mean_length": 33.5, "completions/mean_terminated_length": 33.5, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 86.81481481481481, "frac_reward_zero_std": 1.0, "grad_norm": 0.1781526505947113, "kl": 0.18526531755924225, "learning_rate": 2.951666666666667e-06, "loss": 0.0093, "num_tokens": 1401606.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4688 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.75, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 10.75, "completions/mean_terminated_length": 10.75, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 86.83333333333333, "frac_reward_zero_std": 1.0, "grad_norm": 0.06187942251563072, "kl": 0.015011295210570097, "learning_rate": 2.9511111111111114e-06, "loss": 0.0008, "num_tokens": 1401877.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4689 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 86.85185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.013012576848268509, "kl": 0.008460916578769684, "learning_rate": 2.9505555555555558e-06, "loss": 0.0004, "num_tokens": 1402113.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 4690 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 43.0, "completions/clipped_ratio": 0.0, "completions/max_length": 45.0, "completions/max_terminated_length": 45.0, "completions/mean_length": 43.0, "completions/mean_terminated_length": 43.0, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "epoch": 86.87037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.25669631361961365, "kl": 0.07490886375308037, "learning_rate": 2.95e-06, "loss": 0.0037, "num_tokens": 1402565.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 4691 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.5, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 8.5, "completions/mean_terminated_length": 8.5, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 86.88888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.16009359061717987, "kl": 0.010554888751357794, "learning_rate": 2.949444444444445e-06, "loss": 0.0005, "num_tokens": 1402819.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4692 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 86.9074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.3219102621078491, "kl": 0.04025172255933285, "learning_rate": 2.948888888888889e-06, "loss": 0.002, "num_tokens": 1403079.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 4693 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.5, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 3.5, "completions/mean_terminated_length": 3.5, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 86.92592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.08863499760627747, "kl": 0.004290074110031128, "learning_rate": 2.9483333333333336e-06, "loss": 0.0002, "num_tokens": 1403289.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 4694 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.0, "completions/clipped_ratio": 0.0, "completions/max_length": 25.0, "completions/max_terminated_length": 25.0, "completions/mean_length": 21.0, "completions/mean_terminated_length": 21.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 86.94444444444444, "frac_reward_zero_std": 0.0, "grad_norm": 3.4470460414886475, "kl": 0.07110100984573364, "learning_rate": 2.9477777777777784e-06, "loss": 0.0158, "num_tokens": 1403593.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 4695 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 86.96296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.03276413679122925, "kl": 0.0009160041809082031, "learning_rate": 2.9472222222222223e-06, "loss": 0.0, "num_tokens": 1403805.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 4696 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.75, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 20.75, "completions/mean_terminated_length": 20.75, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 86.98148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.04531927779316902, "kl": 0.013556239195168018, "learning_rate": 2.946666666666667e-06, "loss": 0.0007, "num_tokens": 1404136.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4697 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 87.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.016734827309846878, "kl": 0.24029728770256042, "learning_rate": 2.946111111111111e-06, "loss": 0.0119, "num_tokens": 1404436.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4698 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.011363636702299118, "clip_ratio/low_min": 0.011363636702299118, "clip_ratio/region_mean": 0.011363636702299118, "completion_length": 21.25, "completions/clipped_ratio": 0.0, "completions/max_length": 25.0, "completions/max_terminated_length": 25.0, "completions/mean_length": 21.25, "completions/mean_terminated_length": 21.25, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 87.01851851851852, "frac_reward_zero_std": 0.0, "grad_norm": 3.4707648754119873, "kl": 0.07947263866662979, "learning_rate": 2.945555555555556e-06, "loss": -0.0764, "num_tokens": 1404741.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 4699 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.25, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 20.25, "completions/mean_terminated_length": 20.25, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 87.03703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.09022119641304016, "kl": 0.017233734019100666, "learning_rate": 2.945e-06, "loss": 0.0009, "num_tokens": 1405070.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 4700 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 87.05555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.0017915189964696765, "kl": 0.0033990279771387577, "learning_rate": 2.944444444444445e-06, "loss": 0.0002, "num_tokens": 1405366.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4701 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 44.5, "completions/clipped_ratio": 0.0, "completions/max_length": 51.0, "completions/max_terminated_length": 51.0, "completions/mean_length": 44.5, "completions/mean_terminated_length": 44.5, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "epoch": 87.07407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.08762360364198685, "kl": 0.05483429133892059, "learning_rate": 2.943888888888889e-06, "loss": 0.0027, "num_tokens": 1405824.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 4702 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.5, "completions/clipped_ratio": 0.0, "completions/max_length": 25.0, "completions/max_terminated_length": 25.0, "completions/mean_length": 19.5, "completions/mean_terminated_length": 19.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 87.0925925925926, "frac_reward_zero_std": 0.0, "grad_norm": 3.653949022293091, "kl": 0.14190537109971046, "learning_rate": 2.9433333333333337e-06, "loss": -0.0228, "num_tokens": 1406122.0, "reward": 4.875, "reward_std": 3.1983067989349365, "rewards/reward_combined/mean": 4.875, "rewards/reward_combined/std": 3.1983067989349365, "step": 4703 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.0, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 17.0, "completions/mean_terminated_length": 17.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 87.11111111111111, "frac_reward_zero_std": 0.0, "grad_norm": 4.9542694091796875, "kl": 0.025681382045149803, "learning_rate": 2.9427777777777784e-06, "loss": -0.0591, "num_tokens": 1406410.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 4704 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 87.12962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.014459287747740746, "kl": 0.2271270900964737, "learning_rate": 2.9422222222222224e-06, "loss": 0.0113, "num_tokens": 1406712.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4705 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.0, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 22.0, "completions/mean_terminated_length": 22.0, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 87.14814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.05562114343047142, "kl": 0.023562216199934483, "learning_rate": 2.941666666666667e-06, "loss": 0.0012, "num_tokens": 1407028.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4706 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.25, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 8.25, "completions/mean_terminated_length": 8.25, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 87.16666666666667, "frac_reward_zero_std": 1.0, "grad_norm": 0.250985324382782, "kl": 0.04286300390958786, "learning_rate": 2.941111111111111e-06, "loss": 0.0023, "num_tokens": 1407273.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 4707 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 87.18518518518519, "frac_reward_zero_std": 0.0, "grad_norm": 4.6947922706604, "kl": 0.07892781344708055, "learning_rate": 2.940555555555556e-06, "loss": 0.0024, "num_tokens": 1407533.0, "reward": 2.75, "reward_std": 1.5, "rewards/reward_combined/mean": 2.75, "rewards/reward_combined/std": 1.5, "step": 4708 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 5.25, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 5.25, "completions/mean_terminated_length": 5.25, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 87.20370370370371, "frac_reward_zero_std": 1.0, "grad_norm": 0.05123851075768471, "kl": 0.00554554583504796, "learning_rate": 2.9400000000000002e-06, "loss": 0.0003, "num_tokens": 1407758.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 4709 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.75, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 20.75, "completions/mean_terminated_length": 20.75, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 87.22222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.03851330280303955, "kl": 0.022376226261258125, "learning_rate": 2.9394444444444446e-06, "loss": 0.0011, "num_tokens": 1408089.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4710 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 87.24074074074075, "frac_reward_zero_std": 1.0, "grad_norm": 0.06303104758262634, "kl": 0.014918883331120014, "learning_rate": 2.938888888888889e-06, "loss": 0.0007, "num_tokens": 1408357.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4711 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.25, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 18.25, "completions/mean_terminated_length": 18.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 87.25925925925925, "frac_reward_zero_std": 1.0, "grad_norm": 0.04342436417937279, "kl": 0.004192456952296197, "learning_rate": 2.9383333333333337e-06, "loss": 0.0002, "num_tokens": 1408670.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4712 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 52.0, "completions/clipped_ratio": 0.0, "completions/max_length": 122.0, "completions/max_terminated_length": 122.0, "completions/mean_length": 52.0, "completions/mean_terminated_length": 52.0, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 87.27777777777777, "frac_reward_zero_std": 0.0, "grad_norm": 3.277365207672119, "kl": 0.17141198366880417, "learning_rate": 2.937777777777778e-06, "loss": 0.3052, "num_tokens": 1409102.0, "reward": 5.75, "reward_std": 4.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 4.5, "step": 4713 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 9.5, "completions/mean_terminated_length": 9.5, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 87.29629629629629, "frac_reward_zero_std": 1.0, "grad_norm": 0.050372034311294556, "kl": 0.0027555361157283187, "learning_rate": 2.9372222222222224e-06, "loss": 0.0001, "num_tokens": 1409360.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4714 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.75, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.75, "completions/mean_terminated_length": 6.75, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 87.31481481481481, "frac_reward_zero_std": 1.0, "grad_norm": 0.00478219473734498, "kl": 0.00038710024091415107, "learning_rate": 2.936666666666667e-06, "loss": 0.0, "num_tokens": 1409595.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 4715 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 87.33333333333333, "frac_reward_zero_std": 1.0, "grad_norm": 0.012379170395433903, "kl": 0.008572518825531006, "learning_rate": 2.936111111111111e-06, "loss": 0.0004, "num_tokens": 1409831.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 4716 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 27.5, "completions/clipped_ratio": 0.0, "completions/max_length": 32.0, "completions/max_terminated_length": 32.0, "completions/mean_length": 27.5, "completions/mean_terminated_length": 27.5, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 87.35185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.09977451711893082, "kl": 0.14758621156215668, "learning_rate": 2.935555555555556e-06, "loss": 0.0078, "num_tokens": 1410157.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4717 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.25, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 22.25, "completions/mean_terminated_length": 22.25, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 87.37037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.028112489730119705, "kl": 0.01130242319777608, "learning_rate": 2.9350000000000003e-06, "loss": 0.0006, "num_tokens": 1410490.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4718 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 87.38888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.01376794371753931, "kl": 0.013981487601995468, "learning_rate": 2.9344444444444446e-06, "loss": 0.0007, "num_tokens": 1410802.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 4719 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 17.0, "completions/mean_terminated_length": 17.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 87.4074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.037253204733133316, "kl": 0.007419012952595949, "learning_rate": 2.933888888888889e-06, "loss": 0.0004, "num_tokens": 1411110.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4720 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 87.42592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.013447781093418598, "kl": 0.00022142827219795436, "learning_rate": 2.9333333333333338e-06, "loss": 0.0, "num_tokens": 1411366.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 4721 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 87.44444444444444, "frac_reward_zero_std": 0.0, "grad_norm": 1.771054983139038, "kl": 0.05834909179247916, "learning_rate": 2.932777777777778e-06, "loss": 0.0688, "num_tokens": 1411652.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 4722 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.25, "completions/clipped_ratio": 0.0, "completions/max_length": 27.0, "completions/max_terminated_length": 27.0, "completions/mean_length": 24.25, "completions/mean_terminated_length": 24.25, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 87.46296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.16534943878650665, "kl": 0.07724739611148834, "learning_rate": 2.9322222222222225e-06, "loss": 0.0038, "num_tokens": 1411977.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4723 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 42.0, "completions/clipped_ratio": 0.0, "completions/max_length": 52.0, "completions/max_terminated_length": 52.0, "completions/mean_length": 42.0, "completions/mean_terminated_length": 42.0, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 87.48148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.037924982607364655, "kl": 0.029086475260555744, "learning_rate": 2.9316666666666673e-06, "loss": 0.0014, "num_tokens": 1412365.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4724 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 38.75, "completions/clipped_ratio": 0.0, "completions/max_length": 41.0, "completions/max_terminated_length": 41.0, "completions/mean_length": 38.75, "completions/mean_terminated_length": 38.75, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 87.5, "frac_reward_zero_std": 0.0, "grad_norm": 2.809920072555542, "kl": 0.059070393443107605, "learning_rate": 2.931111111111111e-06, "loss": 0.0076, "num_tokens": 1412744.0, "reward": 5.25, "reward_std": 4.5, "rewards/reward_combined/mean": 5.25, "rewards/reward_combined/std": 4.5, "step": 4725 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.5, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 4.5, "completions/mean_terminated_length": 4.5, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 87.51851851851852, "frac_reward_zero_std": 0.0, "grad_norm": 11.275504112243652, "kl": 0.03502184897661209, "learning_rate": 2.930555555555556e-06, "loss": 0.3325, "num_tokens": 1412958.0, "reward": 2.75, "reward_std": 1.5, "rewards/reward_combined/mean": 2.75, "rewards/reward_combined/std": 1.5, "step": 4726 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 87.53703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.012540834955871105, "kl": 0.00417017936706543, "learning_rate": 2.93e-06, "loss": 0.0002, "num_tokens": 1413218.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 4727 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 87.55555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.016557864844799042, "kl": 0.0015445875469595194, "learning_rate": 2.9294444444444447e-06, "loss": 0.0001, "num_tokens": 1413538.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4728 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.75, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 16.75, "completions/mean_terminated_length": 16.75, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 87.57407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.026183033362030983, "kl": 0.004388433299027383, "learning_rate": 2.928888888888889e-06, "loss": 0.0002, "num_tokens": 1413833.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4729 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 87.5925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.027727415785193443, "kl": 0.011784444097429514, "learning_rate": 2.9283333333333334e-06, "loss": 0.0006, "num_tokens": 1414093.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4730 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 87.61111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.024089889600872993, "kl": 0.003511048387736082, "learning_rate": 2.927777777777778e-06, "loss": 0.0002, "num_tokens": 1414363.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4731 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.75, "completions/clipped_ratio": 0.0, "completions/max_length": 37.0, "completions/max_terminated_length": 37.0, "completions/mean_length": 25.75, "completions/mean_terminated_length": 25.75, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 87.62962962962963, "frac_reward_zero_std": 0.0, "grad_norm": 2.0324866771698, "kl": 0.03121026512235403, "learning_rate": 2.9272222222222225e-06, "loss": -0.1611, "num_tokens": 1414718.0, "reward": 7.625, "reward_std": 0.25, "rewards/reward_combined/mean": 7.625, "rewards/reward_combined/std": 0.25, "step": 4732 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.75, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 4.75, "completions/mean_terminated_length": 4.75, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 87.64814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.007049939129501581, "kl": 0.0005720749613828957, "learning_rate": 2.9266666666666673e-06, "loss": 0.0, "num_tokens": 1414937.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 4733 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.0, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 20.0, "completions/mean_terminated_length": 20.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 87.66666666666667, "frac_reward_zero_std": 1.0, "grad_norm": 0.08976706117391586, "kl": 0.15128809213638306, "learning_rate": 2.9261111111111112e-06, "loss": 0.0074, "num_tokens": 1415257.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4734 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.75, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 4.75, "completions/mean_terminated_length": 4.75, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 87.68518518518519, "frac_reward_zero_std": 1.0, "grad_norm": 0.07060734927654266, "kl": 0.03419820964336395, "learning_rate": 2.925555555555556e-06, "loss": 0.0017, "num_tokens": 1415476.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 4735 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.25, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.25, "completions/mean_terminated_length": 14.25, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 87.70370370370371, "frac_reward_zero_std": 1.0, "grad_norm": 0.048182591795921326, "kl": 0.013229423202574253, "learning_rate": 2.925e-06, "loss": 0.0007, "num_tokens": 1415765.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4736 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 26.75, "completions/clipped_ratio": 0.0, "completions/max_length": 31.0, "completions/max_terminated_length": 31.0, "completions/mean_length": 26.75, "completions/mean_terminated_length": 26.75, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 87.72222222222223, "frac_reward_zero_std": 0.0, "grad_norm": 1.4381402730941772, "kl": 0.18985315039753914, "learning_rate": 2.9244444444444447e-06, "loss": 0.007, "num_tokens": 1416108.0, "reward": 5.5, "reward_std": 2.309401035308838, "rewards/reward_combined/mean": 5.5, "rewards/reward_combined/std": 2.309401035308838, "step": 4737 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.75, "completions/clipped_ratio": 0.0, "completions/max_length": 25.0, "completions/max_terminated_length": 25.0, "completions/mean_length": 20.75, "completions/mean_terminated_length": 20.75, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 87.74074074074075, "frac_reward_zero_std": 0.0, "grad_norm": 2.3523292541503906, "kl": 0.024619970936328173, "learning_rate": 2.923888888888889e-06, "loss": 0.066, "num_tokens": 1416443.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 4738 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 10.5, "completions/mean_terminated_length": 10.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 87.75925925925925, "frac_reward_zero_std": 1.0, "grad_norm": 0.05702238157391548, "kl": 0.002585303271189332, "learning_rate": 2.9233333333333334e-06, "loss": 0.0001, "num_tokens": 1416709.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4739 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 87.77777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.02266845852136612, "kl": 0.004306337097659707, "learning_rate": 2.9227777777777782e-06, "loss": 0.0002, "num_tokens": 1416993.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4740 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.5, "completions/clipped_ratio": 0.0, "completions/max_length": 37.0, "completions/max_terminated_length": 37.0, "completions/mean_length": 32.5, "completions/mean_terminated_length": 32.5, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 87.79629629629629, "frac_reward_zero_std": 1.0, "grad_norm": 0.4284440577030182, "kl": 0.14381437748670578, "learning_rate": 2.9222222222222226e-06, "loss": 0.0069, "num_tokens": 1417351.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4741 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 87.81481481481481, "frac_reward_zero_std": 1.0, "grad_norm": 0.09735295921564102, "kl": 0.01756682712584734, "learning_rate": 2.921666666666667e-06, "loss": 0.0009, "num_tokens": 1417623.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4742 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 87.83333333333333, "frac_reward_zero_std": 1.0, "grad_norm": 0.016694726422429085, "kl": 0.00410560192540288, "learning_rate": 2.9211111111111113e-06, "loss": 0.0002, "num_tokens": 1417911.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4743 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 87.85185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.059683576226234436, "kl": 0.02838539518415928, "learning_rate": 2.920555555555556e-06, "loss": 0.0014, "num_tokens": 1418201.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4744 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 87.87037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.00014023702533449978, "kl": 4.7497451305389404e-05, "learning_rate": 2.92e-06, "loss": 0.0, "num_tokens": 1418421.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 4745 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 87.88888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.035626936703920364, "kl": 0.0006078630685806274, "learning_rate": 2.9194444444444448e-06, "loss": 0.0, "num_tokens": 1418633.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 4746 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.5, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 12.5, "completions/mean_terminated_length": 12.5, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 87.9074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.28181371092796326, "kl": 0.020014163106679916, "learning_rate": 2.9188888888888887e-06, "loss": 0.001, "num_tokens": 1418895.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4747 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.0, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 24.0, "completions/mean_terminated_length": 24.0, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 87.92592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.0022696233354508877, "kl": 0.09039243310689926, "learning_rate": 2.9183333333333335e-06, "loss": 0.0045, "num_tokens": 1419259.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 4748 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 87.94444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.14751267433166504, "kl": 0.03715147450566292, "learning_rate": 2.9177777777777783e-06, "loss": 0.0018, "num_tokens": 1419527.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4749 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 87.96296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.027171365916728973, "kl": 0.0029862994560971856, "learning_rate": 2.917222222222222e-06, "loss": 0.0001, "num_tokens": 1419805.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4750 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 87.98148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.02792362868785858, "kl": 0.004652512026950717, "learning_rate": 2.916666666666667e-06, "loss": 0.0002, "num_tokens": 1420087.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4751 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.0, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 20.0, "completions/mean_terminated_length": 20.0, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 88.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.07312694936990738, "kl": 0.022932998836040497, "learning_rate": 2.9161111111111113e-06, "loss": 0.0011, "num_tokens": 1420387.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4752 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.25, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 16.25, "completions/mean_terminated_length": 16.25, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 88.01851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.12531623244285583, "kl": 0.025869976729154587, "learning_rate": 2.915555555555556e-06, "loss": 0.0012, "num_tokens": 1420672.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4753 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.5, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 18.5, "completions/mean_terminated_length": 18.5, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 88.03703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.3286682367324829, "kl": 0.07063721120357513, "learning_rate": 2.915e-06, "loss": 0.0032, "num_tokens": 1420974.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4754 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 88.05555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.14892266690731049, "kl": 0.028407561592757702, "learning_rate": 2.914444444444445e-06, "loss": 0.0015, "num_tokens": 1421268.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4755 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.75, "completions/clipped_ratio": 0.0, "completions/max_length": 25.0, "completions/max_terminated_length": 25.0, "completions/mean_length": 22.75, "completions/mean_terminated_length": 22.75, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 88.07407407407408, "frac_reward_zero_std": 0.0, "grad_norm": 2.5029163360595703, "kl": 0.04192948713898659, "learning_rate": 2.9138888888888888e-06, "loss": 0.0336, "num_tokens": 1421603.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 4756 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 88.0925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.015363860875368118, "kl": 0.004083938314579427, "learning_rate": 2.9133333333333335e-06, "loss": 0.0002, "num_tokens": 1421887.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4757 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 28.0, "completions/clipped_ratio": 0.0, "completions/max_length": 38.0, "completions/max_terminated_length": 38.0, "completions/mean_length": 28.0, "completions/mean_terminated_length": 28.0, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 88.11111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.49079465866088867, "kl": 0.19855720549821854, "learning_rate": 2.9127777777777783e-06, "loss": 0.0107, "num_tokens": 1422235.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4758 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 16.5, "completions/mean_terminated_length": 16.5, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 88.12962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.05460001155734062, "kl": 0.008670082315802574, "learning_rate": 2.9122222222222222e-06, "loss": 0.0004, "num_tokens": 1422525.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4759 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 88.14814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.009502959437668324, "kl": 0.004245225572958589, "learning_rate": 2.911666666666667e-06, "loss": 0.0002, "num_tokens": 1422789.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4760 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.5, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 8.5, "completions/mean_terminated_length": 8.5, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 88.16666666666667, "frac_reward_zero_std": 1.0, "grad_norm": 0.029826149344444275, "kl": 0.0018151641124859452, "learning_rate": 2.9111111111111114e-06, "loss": 0.0001, "num_tokens": 1423043.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 4761 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.0, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 19.0, "completions/mean_terminated_length": 19.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 88.18518518518519, "frac_reward_zero_std": 1.0, "grad_norm": 0.06770951300859451, "kl": 0.09318256378173828, "learning_rate": 2.9105555555555557e-06, "loss": 0.0046, "num_tokens": 1423347.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4762 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 88.20370370370371, "frac_reward_zero_std": 1.0, "grad_norm": 0.03720027208328247, "kl": 0.0016014724387787282, "learning_rate": 2.91e-06, "loss": 0.0001, "num_tokens": 1423603.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 4763 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.75, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.75, "completions/mean_terminated_length": 14.75, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 88.22222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.015268915332853794, "kl": 0.0327039984986186, "learning_rate": 2.909444444444445e-06, "loss": 0.0017, "num_tokens": 1423894.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4764 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 50.5, "completions/clipped_ratio": 0.0, "completions/max_length": 55.0, "completions/max_terminated_length": 55.0, "completions/mean_length": 50.5, "completions/mean_terminated_length": 50.5, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "epoch": 88.24074074074075, "frac_reward_zero_std": 1.0, "grad_norm": 0.07731977105140686, "kl": 0.027134111151099205, "learning_rate": 2.908888888888889e-06, "loss": 0.0014, "num_tokens": 1424316.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4765 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.0, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 24.0, "completions/mean_terminated_length": 24.0, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 88.25925925925925, "frac_reward_zero_std": 1.0, "grad_norm": 0.0024457403924316168, "kl": 0.09034114703536034, "learning_rate": 2.9083333333333336e-06, "loss": 0.0045, "num_tokens": 1424680.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 4766 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.5, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 19.5, "completions/mean_terminated_length": 19.5, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 88.27777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.07492563873529434, "kl": 0.019456918351352215, "learning_rate": 2.9077777777777784e-06, "loss": 0.001, "num_tokens": 1425010.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4767 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.25, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.25, "completions/mean_terminated_length": 13.25, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 88.29629629629629, "frac_reward_zero_std": 1.0, "grad_norm": 0.23334547877311707, "kl": 0.05495645321207121, "learning_rate": 2.9072222222222223e-06, "loss": 0.0027, "num_tokens": 1425287.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4768 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 29.75, "completions/clipped_ratio": 0.0, "completions/max_length": 32.0, "completions/max_terminated_length": 32.0, "completions/mean_length": 29.75, "completions/mean_terminated_length": 29.75, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 88.31481481481481, "frac_reward_zero_std": 0.0, "grad_norm": 2.852785348892212, "kl": 0.2575681209564209, "learning_rate": 2.906666666666667e-06, "loss": 0.0363, "num_tokens": 1425622.0, "reward": 5.125, "reward_std": 3.4731109142303467, "rewards/reward_combined/mean": 5.125, "rewards/reward_combined/std": 3.4731109142303467, "step": 4769 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 88.33333333333333, "frac_reward_zero_std": 1.0, "grad_norm": 0.012982835993170738, "kl": 0.008487068116664886, "learning_rate": 2.9061111111111114e-06, "loss": 0.0004, "num_tokens": 1425858.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 4770 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 88.35185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.0012905363691970706, "kl": 0.0017437615315429866, "learning_rate": 2.9055555555555558e-06, "loss": 0.0001, "num_tokens": 1426138.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4771 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 88.37037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.02561625838279724, "kl": 0.002091570175252855, "learning_rate": 2.905e-06, "loss": 0.0001, "num_tokens": 1426408.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4772 }, { "clip_ratio/high_max": 0.00561797758564353, "clip_ratio/high_mean": 0.00561797758564353, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00561797758564353, "completion_length": 45.5, "completions/clipped_ratio": 0.0, "completions/max_length": 56.0, "completions/max_terminated_length": 56.0, "completions/mean_length": 45.5, "completions/mean_terminated_length": 45.5, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 88.38888888888889, "frac_reward_zero_std": 0.0, "grad_norm": 1.8862621784210205, "kl": 0.06079212762415409, "learning_rate": 2.904444444444445e-06, "loss": 0.108, "num_tokens": 1426870.0, "reward": 2.375, "reward_std": 1.25, "rewards/reward_combined/mean": 2.375, "rewards/reward_combined/std": 1.25, "step": 4773 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 88.4074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.031228171661496162, "kl": 0.001300507748965174, "learning_rate": 2.903888888888889e-06, "loss": 0.0001, "num_tokens": 1427134.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4774 }, { "clip_ratio/high_max": 0.00909090880304575, "clip_ratio/high_mean": 0.00909090880304575, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00909090880304575, "completion_length": 20.25, "completions/clipped_ratio": 0.0, "completions/max_length": 28.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 20.25, "completions/mean_terminated_length": 20.25, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 88.42592592592592, "frac_reward_zero_std": 0.0, "grad_norm": 2.9033353328704834, "kl": 0.07222787663340569, "learning_rate": 2.9033333333333336e-06, "loss": -0.0048, "num_tokens": 1427435.0, "reward": 6.0, "reward_std": 3.34165620803833, "rewards/reward_combined/mean": 6.0, "rewards/reward_combined/std": 3.34165620803833, "step": 4775 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 5.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 5.0, "completions/mean_terminated_length": 5.0, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 88.44444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.003792237024754286, "kl": 0.0007859170436859131, "learning_rate": 2.9027777777777784e-06, "loss": 0.0, "num_tokens": 1427655.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 4776 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 29.75, "completions/clipped_ratio": 0.0, "completions/max_length": 31.0, "completions/max_terminated_length": 31.0, "completions/mean_length": 29.75, "completions/mean_terminated_length": 29.75, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 88.46296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 6.5693583488464355, "kl": 1.081983521580696, "learning_rate": 2.9022222222222223e-06, "loss": 0.0533, "num_tokens": 1427998.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4777 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 34.5, "completions/clipped_ratio": 0.0, "completions/max_length": 36.0, "completions/max_terminated_length": 36.0, "completions/mean_length": 34.5, "completions/mean_terminated_length": 34.5, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 88.48148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.09851040691137314, "kl": 0.08761581033468246, "learning_rate": 2.901666666666667e-06, "loss": 0.0044, "num_tokens": 1428364.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4778 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 88.5, "frac_reward_zero_std": 1.0, "grad_norm": 0.0001394259015796706, "kl": 4.8995018005371094e-05, "learning_rate": 2.901111111111111e-06, "loss": 0.0, "num_tokens": 1428584.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 4779 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.25, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 16.25, "completions/mean_terminated_length": 16.25, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 88.51851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.014954773709177971, "kl": 0.001524886058177799, "learning_rate": 2.900555555555556e-06, "loss": 0.0001, "num_tokens": 1428909.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4780 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.0, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 20.0, "completions/mean_terminated_length": 20.0, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 88.53703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.07714271545410156, "kl": 0.022608079947531223, "learning_rate": 2.9e-06, "loss": 0.0011, "num_tokens": 1429209.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4781 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 88.55555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.030340025201439857, "kl": 0.004566755145788193, "learning_rate": 2.8994444444444445e-06, "loss": 0.0002, "num_tokens": 1429469.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 4782 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.5, "completions/clipped_ratio": 0.0, "completions/max_length": 29.0, "completions/max_terminated_length": 29.0, "completions/mean_length": 23.5, "completions/mean_terminated_length": 23.5, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 88.57407407407408, "frac_reward_zero_std": 0.0, "grad_norm": 3.4796454906463623, "kl": 0.20121292769908905, "learning_rate": 2.898888888888889e-06, "loss": 0.0312, "num_tokens": 1429811.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 4783 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 88.5925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.010652676224708557, "kl": 0.0020627565681934357, "learning_rate": 2.8983333333333337e-06, "loss": 0.0001, "num_tokens": 1430055.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 4784 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.5, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 8.5, "completions/mean_terminated_length": 8.5, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 88.61111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.027461573481559753, "kl": 0.0013913922011852264, "learning_rate": 2.8977777777777785e-06, "loss": 0.0001, "num_tokens": 1430309.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4785 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.75, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 4.75, "completions/mean_terminated_length": 4.75, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 88.62962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.08864323049783707, "kl": 0.03583947941660881, "learning_rate": 2.8972222222222224e-06, "loss": 0.0018, "num_tokens": 1430528.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 4786 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.5, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.5, "completions/mean_terminated_length": 6.5, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 88.64814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.05400221422314644, "kl": 0.0027345533890184015, "learning_rate": 2.896666666666667e-06, "loss": 0.0001, "num_tokens": 1430762.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 4787 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.5, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 3.5, "completions/mean_terminated_length": 3.5, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 88.66666666666667, "frac_reward_zero_std": 1.0, "grad_norm": 0.10113304853439331, "kl": 0.0053087323904037476, "learning_rate": 2.896111111111111e-06, "loss": 0.0002, "num_tokens": 1430972.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 4788 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 88.68518518518519, "frac_reward_zero_std": 1.0, "grad_norm": 0.01172309648245573, "kl": 0.0035212613875046372, "learning_rate": 2.895555555555556e-06, "loss": 0.0002, "num_tokens": 1431260.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4789 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 88.70370370370371, "frac_reward_zero_std": 1.0, "grad_norm": 0.019944973289966583, "kl": 0.0065510914428159595, "learning_rate": 2.8950000000000002e-06, "loss": 0.0003, "num_tokens": 1431558.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4790 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.25, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 18.25, "completions/mean_terminated_length": 18.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 88.72222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.2927982807159424, "kl": 0.23042084369808435, "learning_rate": 2.8944444444444446e-06, "loss": 0.0118, "num_tokens": 1431879.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4791 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 88.74074074074075, "frac_reward_zero_std": 1.0, "grad_norm": 0.03488808125257492, "kl": 0.0032414005836471915, "learning_rate": 2.893888888888889e-06, "loss": 0.0002, "num_tokens": 1432177.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4792 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.75, "completions/clipped_ratio": 0.0, "completions/max_length": 25.0, "completions/max_terminated_length": 25.0, "completions/mean_length": 21.75, "completions/mean_terminated_length": 21.75, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 88.75925925925925, "frac_reward_zero_std": 1.0, "grad_norm": 0.07558346539735794, "kl": 0.0596600566059351, "learning_rate": 2.8933333333333337e-06, "loss": 0.0029, "num_tokens": 1432484.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 4793 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 88.77777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.0092270253226161, "kl": 0.24015343189239502, "learning_rate": 2.892777777777778e-06, "loss": 0.0119, "num_tokens": 1432784.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4794 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 88.79629629629629, "frac_reward_zero_std": 1.0, "grad_norm": 0.00746908551082015, "kl": 0.00029680877923965454, "learning_rate": 2.8922222222222224e-06, "loss": 0.0, "num_tokens": 1432996.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 4795 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.75, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 22.75, "completions/mean_terminated_length": 22.75, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 88.81481481481481, "frac_reward_zero_std": 1.0, "grad_norm": 0.053533539175987244, "kl": 0.0201434176415205, "learning_rate": 2.8916666666666672e-06, "loss": 0.001, "num_tokens": 1433315.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4796 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 88.83333333333333, "frac_reward_zero_std": 1.0, "grad_norm": 0.05619129166007042, "kl": 0.01028800057247281, "learning_rate": 2.891111111111111e-06, "loss": 0.0005, "num_tokens": 1433587.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4797 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.25, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 16.25, "completions/mean_terminated_length": 16.25, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 88.85185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.09255844354629517, "kl": 0.03378046117722988, "learning_rate": 2.890555555555556e-06, "loss": 0.0019, "num_tokens": 1433868.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4798 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 26.5, "completions/clipped_ratio": 0.0, "completions/max_length": 40.0, "completions/max_terminated_length": 40.0, "completions/mean_length": 26.5, "completions/mean_terminated_length": 26.5, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 88.87037037037037, "frac_reward_zero_std": 0.0, "grad_norm": 1.9590427875518799, "kl": 0.031361598521471024, "learning_rate": 2.89e-06, "loss": -0.1765, "num_tokens": 1434226.0, "reward": 7.625, "reward_std": 0.25, "rewards/reward_combined/mean": 7.625, "rewards/reward_combined/std": 0.25, "step": 4799 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 88.88888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.012514932081103325, "kl": 0.014358732849359512, "learning_rate": 2.8894444444444446e-06, "loss": 0.0007, "num_tokens": 1434538.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 4800 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 18.0, "completions/mean_terminated_length": 18.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 88.9074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.07641027867794037, "kl": 0.1384682133793831, "learning_rate": 2.888888888888889e-06, "loss": 0.0069, "num_tokens": 1434850.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4801 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.75, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 17.75, "completions/mean_terminated_length": 17.75, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 88.92592592592592, "frac_reward_zero_std": 0.0, "grad_norm": 2.0306077003479004, "kl": 0.12683709524571896, "learning_rate": 2.8883333333333333e-06, "loss": 0.0658, "num_tokens": 1435161.0, "reward": 4.5, "reward_std": 4.041451930999756, "rewards/reward_combined/mean": 4.5, "rewards/reward_combined/std": 4.041451930999756, "step": 4802 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 88.94444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.05065544694662094, "kl": 0.02036858070641756, "learning_rate": 2.887777777777778e-06, "loss": 0.001, "num_tokens": 1435429.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4803 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 88.96296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.03364666551351547, "kl": 0.010439134202897549, "learning_rate": 2.8872222222222225e-06, "loss": 0.0005, "num_tokens": 1435689.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4804 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 34.75, "completions/clipped_ratio": 0.0, "completions/max_length": 39.0, "completions/max_terminated_length": 39.0, "completions/mean_length": 34.75, "completions/mean_terminated_length": 34.75, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 88.98148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.12728330492973328, "kl": 0.054843402467668056, "learning_rate": 2.8866666666666673e-06, "loss": 0.0029, "num_tokens": 1436052.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4805 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 89.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.006455263122916222, "kl": 0.0006208866834640503, "learning_rate": 2.886111111111111e-06, "loss": 0.0, "num_tokens": 1436264.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 4806 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.25, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.25, "completions/mean_terminated_length": 11.25, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 89.01851851851852, "frac_reward_zero_std": 0.0, "grad_norm": 4.9311203956604, "kl": 0.0024815958458930254, "learning_rate": 2.885555555555556e-06, "loss": 0.0196, "num_tokens": 1436533.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 4807 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.75, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 8.75, "completions/mean_terminated_length": 8.75, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 89.03703703703704, "frac_reward_zero_std": 0.0, "grad_norm": 6.256448268890381, "kl": 0.04014924168586731, "learning_rate": 2.885e-06, "loss": 0.0784, "num_tokens": 1436796.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 4808 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 40.75, "completions/clipped_ratio": 0.0, "completions/max_length": 49.0, "completions/max_terminated_length": 49.0, "completions/mean_length": 40.75, "completions/mean_terminated_length": 40.75, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 89.05555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.05009405314922333, "kl": 0.03531377390027046, "learning_rate": 2.8844444444444447e-06, "loss": 0.0018, "num_tokens": 1437183.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4809 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 89.07407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.028169749304652214, "kl": 0.02139719109982252, "learning_rate": 2.883888888888889e-06, "loss": 0.0011, "num_tokens": 1437475.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4810 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 89.0925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.0336044505238533, "kl": 0.003804002481047064, "learning_rate": 2.8833333333333334e-06, "loss": 0.0002, "num_tokens": 1437753.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4811 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 29.75, "completions/clipped_ratio": 0.0, "completions/max_length": 33.0, "completions/max_terminated_length": 33.0, "completions/mean_length": 29.75, "completions/mean_terminated_length": 29.75, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 89.11111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.1677924543619156, "kl": 0.15725569427013397, "learning_rate": 2.882777777777778e-06, "loss": 0.0079, "num_tokens": 1438096.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4812 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.0, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 21.0, "completions/mean_terminated_length": 21.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 89.12962962962963, "frac_reward_zero_std": 0.0, "grad_norm": 2.6651787757873535, "kl": 0.10249894857406616, "learning_rate": 2.8822222222222225e-06, "loss": 0.044, "num_tokens": 1438408.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 4813 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 89.14814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.050119031220674515, "kl": 0.030887193977832794, "learning_rate": 2.881666666666667e-06, "loss": 0.0016, "num_tokens": 1438678.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4814 }, { "clip_ratio/high_max": 0.007042253389954567, "clip_ratio/high_mean": 0.007042253389954567, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.007042253389954567, "completion_length": 34.75, "completions/clipped_ratio": 0.0, "completions/max_length": 38.0, "completions/max_terminated_length": 38.0, "completions/mean_length": 34.75, "completions/mean_terminated_length": 34.75, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 89.16666666666667, "frac_reward_zero_std": 0.0, "grad_norm": 2.084737539291382, "kl": 0.13427647948265076, "learning_rate": 2.8811111111111112e-06, "loss": -0.0391, "num_tokens": 1439045.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 4815 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 89.18518518518519, "frac_reward_zero_std": 1.0, "grad_norm": 0.011118123307824135, "kl": 0.2533904016017914, "learning_rate": 2.880555555555556e-06, "loss": 0.0126, "num_tokens": 1439343.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4816 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 89.20370370370371, "frac_reward_zero_std": 1.0, "grad_norm": 0.02352551557123661, "kl": 0.002032563090324402, "learning_rate": 2.88e-06, "loss": 0.0001, "num_tokens": 1439587.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 4817 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 27.75, "completions/clipped_ratio": 0.0, "completions/max_length": 30.0, "completions/max_terminated_length": 30.0, "completions/mean_length": 27.75, "completions/mean_terminated_length": 27.75, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 89.22222222222223, "frac_reward_zero_std": 0.0, "grad_norm": 4.381362438201904, "kl": 0.5625092908740044, "learning_rate": 2.8794444444444447e-06, "loss": -0.021, "num_tokens": 1439934.0, "reward": 5.25, "reward_std": 4.5, "rewards/reward_combined/mean": 5.25, "rewards/reward_combined/std": 4.5, "step": 4818 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 89.24074074074075, "frac_reward_zero_std": 1.0, "grad_norm": 0.02724708989262581, "kl": 0.01213689660653472, "learning_rate": 2.8788888888888895e-06, "loss": 0.0006, "num_tokens": 1440194.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4819 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 89.25925925925925, "frac_reward_zero_std": 1.0, "grad_norm": 0.025391120463609695, "kl": 0.0017361256177537143, "learning_rate": 2.8783333333333334e-06, "loss": 0.0001, "num_tokens": 1440454.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4820 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 89.27777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.08384673297405243, "kl": 0.007294699200429022, "learning_rate": 2.8777777777777782e-06, "loss": 0.0004, "num_tokens": 1440724.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4821 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 89.29629629629629, "frac_reward_zero_std": 1.0, "grad_norm": 0.003523879684507847, "kl": 0.00042510032653808594, "learning_rate": 2.877222222222222e-06, "loss": 0.0, "num_tokens": 1440936.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 4822 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.75, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.75, "completions/mean_terminated_length": 15.75, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 89.31481481481481, "frac_reward_zero_std": 1.0, "grad_norm": 0.018686728551983833, "kl": 0.0016242980491369963, "learning_rate": 2.876666666666667e-06, "loss": 0.0001, "num_tokens": 1441259.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4823 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.75, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 21.75, "completions/mean_terminated_length": 21.75, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 89.33333333333333, "frac_reward_zero_std": 1.0, "grad_norm": 0.02302592433989048, "kl": 0.012944246176630259, "learning_rate": 2.8761111111111113e-06, "loss": 0.0007, "num_tokens": 1441590.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4824 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 89.35185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.6197114586830139, "kl": 0.07467097043991089, "learning_rate": 2.875555555555556e-06, "loss": 0.0037, "num_tokens": 1441810.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 4825 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 89.37037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.02477312460541725, "kl": 0.004659267608076334, "learning_rate": 2.875e-06, "loss": 0.0002, "num_tokens": 1442094.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4826 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.75, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 3.75, "completions/mean_terminated_length": 3.75, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 89.38888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.010529886931180954, "kl": 0.0021914437384111807, "learning_rate": 2.8744444444444448e-06, "loss": 0.0001, "num_tokens": 1442305.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 4827 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.25, "completions/clipped_ratio": 0.0, "completions/max_length": 25.0, "completions/max_terminated_length": 25.0, "completions/mean_length": 20.25, "completions/mean_terminated_length": 20.25, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 89.4074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.027941560372710228, "kl": 0.002272442914545536, "learning_rate": 2.8738888888888896e-06, "loss": 0.0001, "num_tokens": 1442626.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4828 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 89.42592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.01942768320441246, "kl": 0.0030540076550096273, "learning_rate": 2.8733333333333335e-06, "loss": 0.0002, "num_tokens": 1442908.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4829 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.75, "completions/clipped_ratio": 0.0, "completions/max_length": 30.0, "completions/max_terminated_length": 30.0, "completions/mean_length": 25.75, "completions/mean_terminated_length": 25.75, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 89.44444444444444, "frac_reward_zero_std": 0.0, "grad_norm": 4.219028472900391, "kl": 0.13232440873980522, "learning_rate": 2.8727777777777783e-06, "loss": 0.076, "num_tokens": 1443227.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 4830 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.75, "completions/clipped_ratio": 0.0, "completions/max_length": 27.0, "completions/max_terminated_length": 27.0, "completions/mean_length": 14.75, "completions/mean_terminated_length": 14.75, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 89.46296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.3407925069332123, "kl": 0.07752988487482071, "learning_rate": 2.872222222222222e-06, "loss": 0.0043, "num_tokens": 1443506.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4831 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.75, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 17.75, "completions/mean_terminated_length": 17.75, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 89.48148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.24500350654125214, "kl": 0.17836903780698776, "learning_rate": 2.871666666666667e-06, "loss": 0.009, "num_tokens": 1443817.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4832 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.75, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 17.75, "completions/mean_terminated_length": 17.75, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 89.5, "frac_reward_zero_std": 1.0, "grad_norm": 0.13647161424160004, "kl": 0.016069812700152397, "learning_rate": 2.8711111111111113e-06, "loss": 0.0009, "num_tokens": 1444116.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4833 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 89.51851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.010651289485394955, "kl": 0.00021141172328498214, "learning_rate": 2.8705555555555557e-06, "loss": 0.0, "num_tokens": 1444372.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 4834 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 89.53703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.013467757031321526, "kl": 0.014016717672348022, "learning_rate": 2.87e-06, "loss": 0.0007, "num_tokens": 1444684.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 4835 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 89.55555555555556, "frac_reward_zero_std": 0.0, "grad_norm": 2.4308698177337646, "kl": 0.01781844114884734, "learning_rate": 2.869444444444445e-06, "loss": 0.0017, "num_tokens": 1444978.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 4836 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 89.57407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.07078909128904343, "kl": 0.012397470884025097, "learning_rate": 2.868888888888889e-06, "loss": 0.0006, "num_tokens": 1445250.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4837 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.75, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 16.75, "completions/mean_terminated_length": 16.75, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 89.5925925925926, "frac_reward_zero_std": 0.0, "grad_norm": 3.999603748321533, "kl": 0.060081735253334045, "learning_rate": 2.8683333333333335e-06, "loss": -0.0673, "num_tokens": 1445533.0, "reward": 3.375, "reward_std": 3.3008837699890137, "rewards/reward_combined/mean": 3.375, "rewards/reward_combined/std": 3.3008837699890137, "step": 4838 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.25, "completions/clipped_ratio": 0.0, "completions/max_length": 30.0, "completions/max_terminated_length": 30.0, "completions/mean_length": 21.25, "completions/mean_terminated_length": 21.25, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 89.61111111111111, "frac_reward_zero_std": 0.0, "grad_norm": 4.534012794494629, "kl": 0.15597239881753922, "learning_rate": 2.8677777777777783e-06, "loss": 0.1381, "num_tokens": 1445838.0, "reward": 4.125, "reward_std": 4.308422088623047, "rewards/reward_combined/mean": 4.125, "rewards/reward_combined/std": 4.308422088623047, "step": 4839 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 89.62962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.0031830035150051117, "kl": 0.0033395234495401382, "learning_rate": 2.8672222222222223e-06, "loss": 0.0002, "num_tokens": 1446134.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4840 }, { "clip_ratio/high_max": 0.003496503457427025, "clip_ratio/high_mean": 0.003496503457427025, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.003496503457427025, "completion_length": 58.0, "completions/clipped_ratio": 0.0, "completions/max_length": 94.0, "completions/max_terminated_length": 94.0, "completions/mean_length": 58.0, "completions/mean_terminated_length": 58.0, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "epoch": 89.64814814814815, "frac_reward_zero_std": 0.0, "grad_norm": 1.8411778211593628, "kl": 0.05201287195086479, "learning_rate": 2.866666666666667e-06, "loss": -0.0025, "num_tokens": 1446646.0, "reward": 2.375, "reward_std": 1.25, "rewards/reward_combined/mean": 2.375, "rewards/reward_combined/std": 1.25, "step": 4841 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 5.25, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 5.25, "completions/mean_terminated_length": 5.25, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 89.66666666666667, "frac_reward_zero_std": 1.0, "grad_norm": 0.15173836052417755, "kl": 0.011811357457190752, "learning_rate": 2.8661111111111114e-06, "loss": 0.0007, "num_tokens": 1446867.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 4842 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 89.68518518518519, "frac_reward_zero_std": 1.0, "grad_norm": 0.016390033066272736, "kl": 0.007416658103466034, "learning_rate": 2.8655555555555557e-06, "loss": 0.0004, "num_tokens": 1447103.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 4843 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 89.70370370370371, "frac_reward_zero_std": 1.0, "grad_norm": 0.022373508661985397, "kl": 0.0029942869732622057, "learning_rate": 2.865e-06, "loss": 0.0001, "num_tokens": 1447363.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 4844 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 89.72222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.05884520709514618, "kl": 0.004477381706237793, "learning_rate": 2.864444444444445e-06, "loss": 0.0002, "num_tokens": 1447575.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 4845 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 89.74074074074075, "frac_reward_zero_std": 0.0, "grad_norm": 2.0898470878601074, "kl": 0.06769691884983331, "learning_rate": 2.8638888888888892e-06, "loss": 0.0039, "num_tokens": 1447867.0, "reward": 4.0, "reward_std": 4.041451930999756, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 4.041451930999756, "step": 4846 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 89.75925925925925, "frac_reward_zero_std": 1.0, "grad_norm": 0.00818194355815649, "kl": 0.032538168132305145, "learning_rate": 2.8633333333333336e-06, "loss": 0.0016, "num_tokens": 1448083.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 4847 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.25, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 17.25, "completions/mean_terminated_length": 17.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 89.77777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.032794348895549774, "kl": 0.016983441077172756, "learning_rate": 2.8627777777777784e-06, "loss": 0.0009, "num_tokens": 1448400.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4848 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.5, "completions/clipped_ratio": 0.0, "completions/max_length": 59.0, "completions/max_terminated_length": 59.0, "completions/mean_length": 22.5, "completions/mean_terminated_length": 22.5, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 89.79629629629629, "frac_reward_zero_std": 0.0, "grad_norm": 7.004114151000977, "kl": 0.04778917785733938, "learning_rate": 2.8622222222222223e-06, "loss": 0.3706, "num_tokens": 1448714.0, "reward": 5.625, "reward_std": 4.75, "rewards/reward_combined/mean": 5.625, "rewards/reward_combined/std": 4.75, "step": 4849 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.25, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 17.25, "completions/mean_terminated_length": 17.25, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 89.81481481481481, "frac_reward_zero_std": 0.0, "grad_norm": 2.5944395065307617, "kl": 0.040906775277107954, "learning_rate": 2.861666666666667e-06, "loss": -0.0085, "num_tokens": 1449003.0, "reward": 7.625, "reward_std": 0.25, "rewards/reward_combined/mean": 7.625, "rewards/reward_combined/std": 0.25, "step": 4850 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.75, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.75, "completions/mean_terminated_length": 6.75, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 89.83333333333333, "frac_reward_zero_std": 1.0, "grad_norm": 0.014634503982961178, "kl": 0.0008290750702144578, "learning_rate": 2.861111111111111e-06, "loss": 0.0, "num_tokens": 1449238.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 4851 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.75, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 21.75, "completions/mean_terminated_length": 21.75, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 89.85185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.08310246467590332, "kl": 0.03365144319832325, "learning_rate": 2.860555555555556e-06, "loss": 0.0017, "num_tokens": 1449573.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 4852 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.5, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 22.5, "completions/mean_terminated_length": 22.5, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 89.87037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.08817005902528763, "kl": 0.03402060177177191, "learning_rate": 2.86e-06, "loss": 0.0017, "num_tokens": 1449891.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4853 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.25, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 19.25, "completions/mean_terminated_length": 19.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 89.88888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.04497672617435455, "kl": 0.019793631974607706, "learning_rate": 2.8594444444444445e-06, "loss": 0.001, "num_tokens": 1450220.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4854 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.75, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 20.75, "completions/mean_terminated_length": 20.75, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 89.9074074074074, "frac_reward_zero_std": 0.0, "grad_norm": 3.3252103328704834, "kl": 0.04133635759353638, "learning_rate": 2.8588888888888893e-06, "loss": 0.0802, "num_tokens": 1450555.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 4855 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 89.92592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.01607813686132431, "kl": 0.0037074702559038997, "learning_rate": 2.8583333333333336e-06, "loss": 0.0002, "num_tokens": 1450859.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4856 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 89.94444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.034757185727357864, "kl": 0.00828217202797532, "learning_rate": 2.8577777777777784e-06, "loss": 0.0004, "num_tokens": 1451143.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4857 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.75, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 20.75, "completions/mean_terminated_length": 20.75, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 89.96296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.030329445376992226, "kl": 0.07273171842098236, "learning_rate": 2.8572222222222223e-06, "loss": 0.0036, "num_tokens": 1451446.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 4858 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 49.5, "completions/clipped_ratio": 0.0, "completions/max_length": 82.0, "completions/max_terminated_length": 82.0, "completions/mean_length": 49.5, "completions/mean_terminated_length": 49.5, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 89.98148148148148, "frac_reward_zero_std": 0.0, "grad_norm": 1.846146821975708, "kl": 0.02000413089990616, "learning_rate": 2.856666666666667e-06, "loss": 0.1308, "num_tokens": 1451864.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 4859 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.0, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 24.0, "completions/mean_terminated_length": 24.0, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 90.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.002300429390743375, "kl": 0.090362548828125, "learning_rate": 2.856111111111111e-06, "loss": 0.0045, "num_tokens": 1452228.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 4860 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 90.01851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.10487981140613556, "kl": 0.060250423848629, "learning_rate": 2.855555555555556e-06, "loss": 0.003, "num_tokens": 1452500.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4861 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 90.03703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.036186590790748596, "kl": 0.004052322474308312, "learning_rate": 2.855e-06, "loss": 0.0002, "num_tokens": 1452778.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4862 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.75, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 20.75, "completions/mean_terminated_length": 20.75, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 90.05555555555556, "frac_reward_zero_std": 0.0, "grad_norm": 1.0114080905914307, "kl": 0.12553629651665688, "learning_rate": 2.8544444444444445e-06, "loss": 0.0733, "num_tokens": 1453109.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 4863 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.25, "completions/clipped_ratio": 0.0, "completions/max_length": 29.0, "completions/max_terminated_length": 29.0, "completions/mean_length": 22.25, "completions/mean_terminated_length": 22.25, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 90.07407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.1235545203089714, "kl": 0.027800515294075012, "learning_rate": 2.8538888888888893e-06, "loss": 0.0014, "num_tokens": 1453446.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 4864 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.75, "completions/clipped_ratio": 0.0, "completions/max_length": 31.0, "completions/max_terminated_length": 31.0, "completions/mean_length": 21.75, "completions/mean_terminated_length": 21.75, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 90.0925925925926, "frac_reward_zero_std": 0.0, "grad_norm": 3.431391954421997, "kl": 0.08120128139853477, "learning_rate": 2.8533333333333337e-06, "loss": 0.0143, "num_tokens": 1453761.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 4865 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.5, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 24.5, "completions/mean_terminated_length": 24.5, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 90.11111111111111, "frac_reward_zero_std": 0.0, "grad_norm": 1.7327183485031128, "kl": 0.1958077847957611, "learning_rate": 2.852777777777778e-06, "loss": 0.0294, "num_tokens": 1454127.0, "reward": 2.875, "reward_std": 1.25, "rewards/reward_combined/mean": 2.875, "rewards/reward_combined/std": 1.25, "step": 4866 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.5, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 22.5, "completions/mean_terminated_length": 22.5, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 90.12962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.17283986508846283, "kl": 0.04650625213980675, "learning_rate": 2.8522222222222224e-06, "loss": 0.0023, "num_tokens": 1454445.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4867 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.75, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.75, "completions/mean_terminated_length": 6.75, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 90.14814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.008652308955788612, "kl": 0.000643781982944347, "learning_rate": 2.851666666666667e-06, "loss": 0.0, "num_tokens": 1454680.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 4868 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.5, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 17.5, "completions/mean_terminated_length": 17.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 90.16666666666667, "frac_reward_zero_std": 0.0, "grad_norm": 8.132338523864746, "kl": 1.4835994282038882, "learning_rate": 2.851111111111111e-06, "loss": 0.1665, "num_tokens": 1454978.0, "reward": 7.25, "reward_std": 1.5, "rewards/reward_combined/mean": 7.25, "rewards/reward_combined/std": 1.5, "step": 4869 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 90.18518518518519, "frac_reward_zero_std": 1.0, "grad_norm": 0.006800313014537096, "kl": 0.000649869441986084, "learning_rate": 2.850555555555556e-06, "loss": 0.0, "num_tokens": 1455190.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 4870 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.75, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 19.75, "completions/mean_terminated_length": 19.75, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 90.20370370370371, "frac_reward_zero_std": 1.0, "grad_norm": 0.07740820944309235, "kl": 0.022672228747978806, "learning_rate": 2.85e-06, "loss": 0.0011, "num_tokens": 1455521.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4871 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 90.22222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.04422234371304512, "kl": 0.0053061917424201965, "learning_rate": 2.8494444444444446e-06, "loss": 0.0003, "num_tokens": 1455781.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 4872 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.0, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 21.0, "completions/mean_terminated_length": 21.0, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 90.24074074074075, "frac_reward_zero_std": 1.0, "grad_norm": 0.018767599016427994, "kl": 0.009051062632352114, "learning_rate": 2.8488888888888894e-06, "loss": 0.0005, "num_tokens": 1456109.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4873 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.25, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 9.25, "completions/mean_terminated_length": 9.25, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 90.25925925925925, "frac_reward_zero_std": 1.0, "grad_norm": 0.05369754135608673, "kl": 0.016737197525799274, "learning_rate": 2.8483333333333333e-06, "loss": 0.0008, "num_tokens": 1456370.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4874 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 90.27777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.062246717512607574, "kl": 0.013361991383135319, "learning_rate": 2.847777777777778e-06, "loss": 0.0008, "num_tokens": 1456626.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 4875 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.5, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 90.29629629629629, "frac_reward_zero_std": 1.0, "grad_norm": 0.08313967287540436, "kl": 0.007548253750428557, "learning_rate": 2.8472222222222224e-06, "loss": 0.0004, "num_tokens": 1456900.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4876 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 44.0, "completions/clipped_ratio": 0.0, "completions/max_length": 45.0, "completions/max_terminated_length": 45.0, "completions/mean_length": 44.0, "completions/mean_terminated_length": 44.0, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 90.31481481481481, "frac_reward_zero_std": 1.0, "grad_norm": 0.06714726239442825, "kl": 0.06006982363760471, "learning_rate": 2.8466666666666672e-06, "loss": 0.003, "num_tokens": 1457356.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 4877 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.25, "completions/clipped_ratio": 0.0, "completions/max_length": 28.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 20.25, "completions/mean_terminated_length": 20.25, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 90.33333333333333, "frac_reward_zero_std": 1.0, "grad_norm": 0.6901651620864868, "kl": 0.09474997967481613, "learning_rate": 2.846111111111111e-06, "loss": 0.0049, "num_tokens": 1457657.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 4878 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 31.25, "completions/clipped_ratio": 0.0, "completions/max_length": 41.0, "completions/max_terminated_length": 41.0, "completions/mean_length": 31.25, "completions/mean_terminated_length": 31.25, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 90.35185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.0327080599963665, "kl": 0.016239826567471027, "learning_rate": 2.845555555555556e-06, "loss": 0.0008, "num_tokens": 1458006.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4879 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 90.37037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.06343644857406616, "kl": 0.003963956143707037, "learning_rate": 2.845e-06, "loss": 0.0002, "num_tokens": 1458276.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4880 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 90.38888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.022110015153884888, "kl": 0.004661545157432556, "learning_rate": 2.8444444444444446e-06, "loss": 0.0002, "num_tokens": 1458536.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4881 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.5, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 17.5, "completions/mean_terminated_length": 17.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 90.4074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.029195507988333702, "kl": 0.16498678922653198, "learning_rate": 2.8438888888888894e-06, "loss": 0.0082, "num_tokens": 1458846.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4882 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 90.42592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 5.1973085646750405e-05, "kl": 5.543231964111328e-06, "learning_rate": 2.8433333333333334e-06, "loss": 0.0, "num_tokens": 1459066.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 4883 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 28.25, "completions/clipped_ratio": 0.0, "completions/max_length": 29.0, "completions/max_terminated_length": 29.0, "completions/mean_length": 28.25, "completions/mean_terminated_length": 28.25, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 90.44444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.09128483384847641, "kl": 0.14410996437072754, "learning_rate": 2.842777777777778e-06, "loss": 0.0072, "num_tokens": 1459395.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4884 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 90.46296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.050109997391700745, "kl": 0.012101458851248026, "learning_rate": 2.8422222222222225e-06, "loss": 0.0006, "num_tokens": 1459685.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4885 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 90.48148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.004176632966846228, "kl": 0.0001462697982788086, "learning_rate": 2.841666666666667e-06, "loss": 0.0, "num_tokens": 1459941.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 4886 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 90.5, "frac_reward_zero_std": 1.0, "grad_norm": 0.0728953629732132, "kl": 0.03841681685298681, "learning_rate": 2.841111111111111e-06, "loss": 0.0021, "num_tokens": 1460213.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4887 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 90.51851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 1.0549254417419434, "kl": 0.39489758014678955, "learning_rate": 2.840555555555556e-06, "loss": 0.0197, "num_tokens": 1460449.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 4888 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 90.53703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.0036553097888827324, "kl": 0.0011572976945899427, "learning_rate": 2.84e-06, "loss": 0.0001, "num_tokens": 1460769.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4889 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 90.55555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.011253544129431248, "kl": 0.0008504624711349607, "learning_rate": 2.8394444444444447e-06, "loss": 0.0, "num_tokens": 1461049.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4890 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 90.57407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.018235715106129646, "kl": 0.017184725031256676, "learning_rate": 2.8388888888888895e-06, "loss": 0.0009, "num_tokens": 1461341.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4891 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 90.5925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 2.418029546737671, "kl": 0.2859012186527252, "learning_rate": 2.8383333333333334e-06, "loss": 0.0143, "num_tokens": 1461637.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4892 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 38.75, "completions/clipped_ratio": 0.0, "completions/max_length": 62.0, "completions/max_terminated_length": 62.0, "completions/mean_length": 38.75, "completions/mean_terminated_length": 38.75, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 90.61111111111111, "frac_reward_zero_std": 0.0, "grad_norm": 2.0655434131622314, "kl": 0.13288148492574692, "learning_rate": 2.837777777777778e-06, "loss": 0.0069, "num_tokens": 1462016.0, "reward": 5.75, "reward_std": 2.598076105117798, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 2.598076105117798, "step": 4893 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 90.62962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.03875565156340599, "kl": 0.012179846875369549, "learning_rate": 2.837222222222222e-06, "loss": 0.0006, "num_tokens": 1462328.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 4894 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.75, "completions/clipped_ratio": 0.0, "completions/max_length": 28.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 23.75, "completions/mean_terminated_length": 23.75, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 90.64814814814815, "frac_reward_zero_std": 0.0, "grad_norm": 3.1030774116516113, "kl": 0.06543300114572048, "learning_rate": 2.836666666666667e-06, "loss": 0.0534, "num_tokens": 1462675.0, "reward": 6.5, "reward_std": 2.0, "rewards/reward_combined/mean": 6.5, "rewards/reward_combined/std": 2.0, "step": 4895 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.25, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 3.25, "completions/mean_terminated_length": 3.25, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 90.66666666666667, "frac_reward_zero_std": 0.0, "grad_norm": 14.857139587402344, "kl": 0.09899831563234329, "learning_rate": 2.8361111111111113e-06, "loss": 0.2241, "num_tokens": 1462884.0, "reward": 2.75, "reward_std": 1.5, "rewards/reward_combined/mean": 2.75, "rewards/reward_combined/std": 1.5, "step": 4896 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 90.68518518518519, "frac_reward_zero_std": 1.0, "grad_norm": 0.013368293642997742, "kl": 0.004036736208945513, "learning_rate": 2.835555555555556e-06, "loss": 0.0002, "num_tokens": 1463168.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4897 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.75, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 9.75, "completions/mean_terminated_length": 9.75, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 90.70370370370371, "frac_reward_zero_std": 0.0, "grad_norm": 5.983294486999512, "kl": 0.037123020738363266, "learning_rate": 2.835e-06, "loss": 0.3652, "num_tokens": 1463407.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 4898 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.75, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 13.75, "completions/mean_terminated_length": 13.75, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 90.72222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.0792025700211525, "kl": 0.036469035781919956, "learning_rate": 2.8344444444444447e-06, "loss": 0.0018, "num_tokens": 1463678.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4899 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 90.74074074074075, "frac_reward_zero_std": 1.0, "grad_norm": 0.1343344748020172, "kl": 0.023460193537175655, "learning_rate": 2.8338888888888895e-06, "loss": 0.0012, "num_tokens": 1463952.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4900 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 90.75925925925925, "frac_reward_zero_std": 1.0, "grad_norm": 0.027985036373138428, "kl": 0.004133456735871732, "learning_rate": 2.8333333333333335e-06, "loss": 0.0002, "num_tokens": 1464256.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4901 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.5, "completions/clipped_ratio": 0.0, "completions/max_length": 36.0, "completions/max_terminated_length": 36.0, "completions/mean_length": 32.5, "completions/mean_terminated_length": 32.5, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 90.77777777777777, "frac_reward_zero_std": 0.0, "grad_norm": 1.7723889350891113, "kl": 0.12027215585112572, "learning_rate": 2.8327777777777782e-06, "loss": -0.061, "num_tokens": 1464614.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 4902 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.0, "completions/clipped_ratio": 0.0, "completions/max_length": 25.0, "completions/max_terminated_length": 25.0, "completions/mean_length": 22.0, "completions/mean_terminated_length": 22.0, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 90.79629629629629, "frac_reward_zero_std": 1.0, "grad_norm": 0.18339866399765015, "kl": 0.04294145293533802, "learning_rate": 2.832222222222222e-06, "loss": 0.0023, "num_tokens": 1464922.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4903 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 75.25, "completions/clipped_ratio": 0.25, "completions/max_length": 256.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 75.25, "completions/mean_terminated_length": 15.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 90.81481481481481, "frac_reward_zero_std": 0.0, "grad_norm": 4.0937981605529785, "kl": 0.26083686016499996, "learning_rate": 2.831666666666667e-06, "loss": 0.4623, "num_tokens": 1465443.0, "reward": 6.550000190734863, "reward_std": 1.899999976158142, "rewards/reward_combined/mean": 6.550000190734863, "rewards/reward_combined/std": 1.8999998569488525, "step": 4904 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.5, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 90.83333333333333, "frac_reward_zero_std": 0.0, "grad_norm": 3.7317519187927246, "kl": 0.13249099836684763, "learning_rate": 2.8311111111111113e-06, "loss": 0.1488, "num_tokens": 1465729.0, "reward": 7.875, "reward_std": 0.25, "rewards/reward_combined/mean": 7.875, "rewards/reward_combined/std": 0.25, "step": 4905 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 18.0, "completions/mean_terminated_length": 18.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 90.85185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.00852871872484684, "kl": 0.0019235644140280783, "learning_rate": 2.8305555555555557e-06, "loss": 0.0001, "num_tokens": 1466041.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4906 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.75, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 4.75, "completions/mean_terminated_length": 4.75, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 90.87037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.06305994838476181, "kl": 0.002970291650854051, "learning_rate": 2.83e-06, "loss": 0.0001, "num_tokens": 1466260.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 4907 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 90.88888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.01943863555788994, "kl": 0.25432051718235016, "learning_rate": 2.829444444444445e-06, "loss": 0.0127, "num_tokens": 1466558.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4908 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.75, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 13.75, "completions/mean_terminated_length": 13.75, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 90.9074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.19865553081035614, "kl": 0.04876820556819439, "learning_rate": 2.828888888888889e-06, "loss": 0.0025, "num_tokens": 1466845.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4909 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 90.92592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.023191552609205246, "kl": 0.0011610090732574463, "learning_rate": 2.8283333333333335e-06, "loss": 0.0001, "num_tokens": 1467105.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4910 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 46.0, "completions/clipped_ratio": 0.0, "completions/max_length": 51.0, "completions/max_terminated_length": 51.0, "completions/mean_length": 46.0, "completions/mean_terminated_length": 46.0, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 90.94444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.23770447075366974, "kl": 0.05795022565871477, "learning_rate": 2.8277777777777783e-06, "loss": 0.0027, "num_tokens": 1467509.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4911 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 28.25, "completions/clipped_ratio": 0.0, "completions/max_length": 39.0, "completions/max_terminated_length": 39.0, "completions/mean_length": 28.25, "completions/mean_terminated_length": 28.25, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 90.96296296296296, "frac_reward_zero_std": 0.0, "grad_norm": 2.787396192550659, "kl": 0.12714413553476334, "learning_rate": 2.827222222222222e-06, "loss": 0.1049, "num_tokens": 1467858.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 4912 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 90.98148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.010787297040224075, "kl": 0.0020196884870529175, "learning_rate": 2.826666666666667e-06, "loss": 0.0001, "num_tokens": 1468102.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 4913 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 91.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.006521459668874741, "kl": 0.00045975297689437866, "learning_rate": 2.8261111111111113e-06, "loss": 0.0, "num_tokens": 1468314.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 4914 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 91.01851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.17207679152488708, "kl": 0.051830749958753586, "learning_rate": 2.8255555555555557e-06, "loss": 0.0027, "num_tokens": 1468586.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4915 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 91.03703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.020694339647889137, "kl": 0.005713142454624176, "learning_rate": 2.825e-06, "loss": 0.0003, "num_tokens": 1468822.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 4916 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 31.5, "completions/clipped_ratio": 0.0, "completions/max_length": 49.0, "completions/max_terminated_length": 49.0, "completions/mean_length": 31.5, "completions/mean_terminated_length": 31.5, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 91.05555555555556, "frac_reward_zero_std": 0.0, "grad_norm": 2.0638985633850098, "kl": 0.16342266276478767, "learning_rate": 2.824444444444445e-06, "loss": -0.0816, "num_tokens": 1469200.0, "reward": 6.875, "reward_std": 1.6007810831069946, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 1.6007810831069946, "step": 4917 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.5, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 9.5, "completions/mean_terminated_length": 9.5, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 91.07407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.05869205296039581, "kl": 0.0057607407215982676, "learning_rate": 2.823888888888889e-06, "loss": 0.0003, "num_tokens": 1469462.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4918 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 5.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 5.0, "completions/mean_terminated_length": 5.0, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 91.0925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.004826087970286608, "kl": 0.0007470070850104094, "learning_rate": 2.8233333333333335e-06, "loss": 0.0, "num_tokens": 1469682.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 4919 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 91.11111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.12627437710762024, "kl": 0.03437039256095886, "learning_rate": 2.8227777777777783e-06, "loss": 0.0018, "num_tokens": 1469968.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4920 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 29.0, "completions/clipped_ratio": 0.0, "completions/max_length": 34.0, "completions/max_terminated_length": 34.0, "completions/mean_length": 29.0, "completions/mean_terminated_length": 29.0, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 91.12962962962963, "frac_reward_zero_std": 0.0, "grad_norm": 2.4338767528533936, "kl": 0.10307991877198219, "learning_rate": 2.8222222222222223e-06, "loss": 0.0745, "num_tokens": 1470320.0, "reward": 4.75, "reward_std": 3.4034295082092285, "rewards/reward_combined/mean": 4.75, "rewards/reward_combined/std": 3.4034297466278076, "step": 4921 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.5, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 21.5, "completions/mean_terminated_length": 21.5, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 91.14814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.03952951729297638, "kl": 0.008365146815776825, "learning_rate": 2.821666666666667e-06, "loss": 0.0004, "num_tokens": 1470650.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4922 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 27.75, "completions/clipped_ratio": 0.0, "completions/max_length": 34.0, "completions/max_terminated_length": 34.0, "completions/mean_length": 27.75, "completions/mean_terminated_length": 27.75, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 91.16666666666667, "frac_reward_zero_std": 1.0, "grad_norm": 0.1549803465604782, "kl": 0.14358188211917877, "learning_rate": 2.821111111111111e-06, "loss": 0.0072, "num_tokens": 1470977.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4923 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 10.5, "completions/mean_terminated_length": 10.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 91.18518518518519, "frac_reward_zero_std": 1.0, "grad_norm": 0.007395025342702866, "kl": 0.0007459734915755689, "learning_rate": 2.8205555555555557e-06, "loss": 0.0, "num_tokens": 1471239.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4924 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.5, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 18.5, "completions/mean_terminated_length": 18.5, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 91.20370370370371, "frac_reward_zero_std": 1.0, "grad_norm": 0.03010215051472187, "kl": 0.002581228851340711, "learning_rate": 2.82e-06, "loss": 0.0001, "num_tokens": 1471553.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4925 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 83.25, "completions/clipped_ratio": 0.25, "completions/max_length": 256.0, "completions/max_terminated_length": 39.0, "completions/mean_length": 83.25, "completions/mean_terminated_length": 25.666667938232422, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 91.22222222222223, "frac_reward_zero_std": 0.0, "grad_norm": 2.1196951866149902, "kl": 0.24389082193374634, "learning_rate": 2.8194444444444445e-06, "loss": 0.5767, "num_tokens": 1472106.0, "reward": 3.375, "reward_std": 5.313112735748291, "rewards/reward_combined/mean": 3.375, "rewards/reward_combined/std": 5.313112735748291, "step": 4926 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 91.24074074074075, "frac_reward_zero_std": 1.0, "grad_norm": 8.300096669699997e-05, "kl": 6.951391696929932e-06, "learning_rate": 2.8188888888888892e-06, "loss": 0.0, "num_tokens": 1472326.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 4927 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 91.25925925925925, "frac_reward_zero_std": 1.0, "grad_norm": 0.007000116165727377, "kl": 0.0010108500719070435, "learning_rate": 2.8183333333333336e-06, "loss": 0.0001, "num_tokens": 1472538.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 4928 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 91.27777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.11971472948789597, "kl": 0.008719593286514282, "learning_rate": 2.8177777777777784e-06, "loss": 0.0004, "num_tokens": 1472742.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 4929 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 91.29629629629629, "frac_reward_zero_std": 1.0, "grad_norm": 0.05542629584670067, "kl": 0.013356880750507116, "learning_rate": 2.8172222222222223e-06, "loss": 0.0007, "num_tokens": 1473034.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4930 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 91.31481481481481, "frac_reward_zero_std": 1.0, "grad_norm": 0.009386456571519375, "kl": 0.000432651664596051, "learning_rate": 2.816666666666667e-06, "loss": 0.0, "num_tokens": 1473312.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4931 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 91.33333333333333, "frac_reward_zero_std": 1.0, "grad_norm": 0.032122671604156494, "kl": 0.01088618766516447, "learning_rate": 2.816111111111111e-06, "loss": 0.0005, "num_tokens": 1473624.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 4932 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.0, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 17.0, "completions/mean_terminated_length": 17.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 91.35185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.02214672975242138, "kl": 0.0026894129696302116, "learning_rate": 2.815555555555556e-06, "loss": 0.0001, "num_tokens": 1473952.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4933 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 31.0, "completions/clipped_ratio": 0.0, "completions/max_length": 39.0, "completions/max_terminated_length": 39.0, "completions/mean_length": 31.0, "completions/mean_terminated_length": 31.0, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 91.37037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.1270742267370224, "kl": 0.17635396122932434, "learning_rate": 2.815e-06, "loss": 0.0087, "num_tokens": 1474300.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4934 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.25, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 22.25, "completions/mean_terminated_length": 22.25, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 91.38888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.028379518538713455, "kl": 0.025653766468167305, "learning_rate": 2.8144444444444445e-06, "loss": 0.0013, "num_tokens": 1474637.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4935 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 91.4074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.024134216830134392, "kl": 0.0015934616676531732, "learning_rate": 2.8138888888888893e-06, "loss": 0.0001, "num_tokens": 1474893.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 4936 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.0, "completions/clipped_ratio": 0.0, "completions/max_length": 32.0, "completions/max_terminated_length": 32.0, "completions/mean_length": 22.0, "completions/mean_terminated_length": 22.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 91.42592592592592, "frac_reward_zero_std": 0.0, "grad_norm": 3.4014358520507812, "kl": 0.0912390761077404, "learning_rate": 2.8133333333333336e-06, "loss": 0.1596, "num_tokens": 1475209.0, "reward": 5.925000190734863, "reward_std": 4.150000095367432, "rewards/reward_combined/mean": 5.925000190734863, "rewards/reward_combined/std": 4.150000095367432, "step": 4937 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 91.44444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.02646678127348423, "kl": 0.004789587110280991, "learning_rate": 2.812777777777778e-06, "loss": 0.0002, "num_tokens": 1475469.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 4938 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.75, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 13.75, "completions/mean_terminated_length": 13.75, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 91.46296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.011692190542817116, "kl": 0.0022427579388022423, "learning_rate": 2.8122222222222224e-06, "loss": 0.0001, "num_tokens": 1475748.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4939 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.25, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 9.25, "completions/mean_terminated_length": 9.25, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 91.48148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.05923501402139664, "kl": 0.01927915122359991, "learning_rate": 2.811666666666667e-06, "loss": 0.001, "num_tokens": 1476009.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4940 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.25, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.25, "completions/mean_terminated_length": 11.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 91.5, "frac_reward_zero_std": 1.0, "grad_norm": 0.041922908276319504, "kl": 0.01175348786637187, "learning_rate": 2.811111111111111e-06, "loss": 0.0006, "num_tokens": 1476282.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4941 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 91.51851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.015035273507237434, "kl": 0.004151653847657144, "learning_rate": 2.810555555555556e-06, "loss": 0.0002, "num_tokens": 1476566.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4942 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 91.53703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.0021980577148497105, "kl": 0.0002934025105787441, "learning_rate": 2.8100000000000006e-06, "loss": 0.0, "num_tokens": 1476862.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4943 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.5, "completions/clipped_ratio": 0.0, "completions/max_length": 27.0, "completions/max_terminated_length": 27.0, "completions/mean_length": 21.5, "completions/mean_terminated_length": 21.5, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 91.55555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.11419152468442917, "kl": 0.025345132686197758, "learning_rate": 2.8094444444444446e-06, "loss": 0.0013, "num_tokens": 1477196.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 4944 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.0, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 24.0, "completions/mean_terminated_length": 24.0, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 91.57407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.00244551757350564, "kl": 0.09037994593381882, "learning_rate": 2.8088888888888893e-06, "loss": 0.0045, "num_tokens": 1477560.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 4945 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 34.75, "completions/clipped_ratio": 0.0, "completions/max_length": 51.0, "completions/max_terminated_length": 51.0, "completions/mean_length": 34.75, "completions/mean_terminated_length": 34.75, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 91.5925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.028631728142499924, "kl": 0.028330635279417038, "learning_rate": 2.8083333333333333e-06, "loss": 0.0013, "num_tokens": 1477919.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4946 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 91.61111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.03904659301042557, "kl": 0.012673735618591309, "learning_rate": 2.807777777777778e-06, "loss": 0.0006, "num_tokens": 1478195.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4947 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 91.62962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.030014682561159134, "kl": 0.005038786213845015, "learning_rate": 2.8072222222222224e-06, "loss": 0.0002, "num_tokens": 1478497.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4948 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 91.64814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.1154070794582367, "kl": 0.031238640658557415, "learning_rate": 2.806666666666667e-06, "loss": 0.0018, "num_tokens": 1478781.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4949 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.75, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 6.75, "completions/mean_terminated_length": 6.75, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 91.66666666666667, "frac_reward_zero_std": 1.0, "grad_norm": 0.6998348236083984, "kl": 0.13487105816602707, "learning_rate": 2.806111111111111e-06, "loss": 0.0096, "num_tokens": 1479008.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 4950 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.25, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 4.25, "completions/mean_terminated_length": 4.25, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 91.68518518518519, "frac_reward_zero_std": 1.0, "grad_norm": 0.06551746279001236, "kl": 0.002438159193843603, "learning_rate": 2.805555555555556e-06, "loss": 0.0001, "num_tokens": 1479221.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 4951 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 91.70370370370371, "frac_reward_zero_std": 1.0, "grad_norm": 0.014894743449985981, "kl": 0.001786562817869708, "learning_rate": 2.8050000000000007e-06, "loss": 0.0001, "num_tokens": 1479481.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 4952 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 35.75, "completions/clipped_ratio": 0.0, "completions/max_length": 41.0, "completions/max_terminated_length": 41.0, "completions/mean_length": 35.75, "completions/mean_terminated_length": 35.75, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 91.72222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.06567440181970596, "kl": 0.031088879331946373, "learning_rate": 2.8044444444444446e-06, "loss": 0.0016, "num_tokens": 1479848.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4953 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.75, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 19.75, "completions/mean_terminated_length": 19.75, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 91.74074074074075, "frac_reward_zero_std": 1.0, "grad_norm": 0.025830194354057312, "kl": 0.00834212196059525, "learning_rate": 2.8038888888888894e-06, "loss": 0.0004, "num_tokens": 1480179.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4954 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 33.0, "completions/clipped_ratio": 0.0, "completions/max_length": 36.0, "completions/max_terminated_length": 36.0, "completions/mean_length": 33.0, "completions/mean_terminated_length": 33.0, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 91.75925925925925, "frac_reward_zero_std": 1.0, "grad_norm": 0.40670710802078247, "kl": 0.13871371746063232, "learning_rate": 2.8033333333333333e-06, "loss": 0.0066, "num_tokens": 1480539.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4955 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 91.77777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.012978222221136093, "kl": 0.0015529915690422058, "learning_rate": 2.802777777777778e-06, "loss": 0.0001, "num_tokens": 1480783.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 4956 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.0, "completions/clipped_ratio": 0.0, "completions/max_length": 33.0, "completions/max_terminated_length": 33.0, "completions/mean_length": 25.0, "completions/mean_terminated_length": 25.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 91.79629629629629, "frac_reward_zero_std": 0.0, "grad_norm": 3.60384202003479, "kl": 0.33449617214500904, "learning_rate": 2.8022222222222225e-06, "loss": 0.2473, "num_tokens": 1481103.0, "reward": 4.625, "reward_std": 3.4731109142303467, "rewards/reward_combined/mean": 4.625, "rewards/reward_combined/std": 3.4731109142303467, "step": 4957 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.0, "completions/clipped_ratio": 0.0, "completions/max_length": 28.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 22.0, "completions/mean_terminated_length": 22.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 91.81481481481481, "frac_reward_zero_std": 0.0, "grad_norm": 5.808745384216309, "kl": 0.16878405213356018, "learning_rate": 2.801666666666667e-06, "loss": -0.1705, "num_tokens": 1481411.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 4958 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.5, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 18.5, "completions/mean_terminated_length": 18.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 91.83333333333333, "frac_reward_zero_std": 1.0, "grad_norm": 0.05330829694867134, "kl": 0.12368893623352051, "learning_rate": 2.801111111111111e-06, "loss": 0.0062, "num_tokens": 1481713.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4959 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.75, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 13.75, "completions/mean_terminated_length": 13.75, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 91.85185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.06136613339185715, "kl": 0.0051450037863105536, "learning_rate": 2.800555555555556e-06, "loss": 0.0003, "num_tokens": 1481980.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4960 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.25, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 17.25, "completions/mean_terminated_length": 17.25, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 91.87037037037037, "frac_reward_zero_std": 0.0, "grad_norm": 2.4691572189331055, "kl": 0.3032219856977463, "learning_rate": 2.8000000000000003e-06, "loss": 0.0141, "num_tokens": 1482289.0, "reward": 4.5, "reward_std": 4.041451930999756, "rewards/reward_combined/mean": 4.5, "rewards/reward_combined/std": 4.041451930999756, "step": 4961 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 91.88888888888889, "frac_reward_zero_std": 0.0, "grad_norm": 5.583392143249512, "kl": 0.08900807052850723, "learning_rate": 2.7994444444444447e-06, "loss": 0.0387, "num_tokens": 1482557.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 4962 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 91.9074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.010300767607986927, "kl": 0.0008725697989575565, "learning_rate": 2.7988888888888894e-06, "loss": 0.0, "num_tokens": 1482829.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4963 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.0, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 23.0, "completions/mean_terminated_length": 23.0, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 91.92592592592592, "frac_reward_zero_std": 0.0, "grad_norm": 2.685302734375, "kl": 0.01642204448580742, "learning_rate": 2.7983333333333334e-06, "loss": 0.0037, "num_tokens": 1483149.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 4964 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 91.94444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.1943245828151703, "kl": 0.03949319198727608, "learning_rate": 2.797777777777778e-06, "loss": 0.002, "num_tokens": 1483445.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4965 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.75, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.75, "completions/mean_terminated_length": 6.75, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 91.96296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.004218784626573324, "kl": 0.00035923009272664785, "learning_rate": 2.797222222222222e-06, "loss": 0.0, "num_tokens": 1483680.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 4966 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 44.0, "completions/clipped_ratio": 0.0, "completions/max_length": 47.0, "completions/max_terminated_length": 47.0, "completions/mean_length": 44.0, "completions/mean_terminated_length": 44.0, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "epoch": 91.98148148148148, "frac_reward_zero_std": 0.0, "grad_norm": 1.2195872068405151, "kl": 0.08851400390267372, "learning_rate": 2.796666666666667e-06, "loss": 0.03, "num_tokens": 1484136.0, "reward": 1.0, "reward_std": 1.3540064096450806, "rewards/reward_combined/mean": 1.0, "rewards/reward_combined/std": 1.3540064096450806, "step": 4967 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 92.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.018940400332212448, "kl": 0.25428570061922073, "learning_rate": 2.796111111111111e-06, "loss": 0.0127, "num_tokens": 1484434.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4968 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 92.01851851851852, "frac_reward_zero_std": 0.0, "grad_norm": 4.984536170959473, "kl": 0.14363085851073265, "learning_rate": 2.795555555555556e-06, "loss": 0.0356, "num_tokens": 1484716.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 4969 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 92.03703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.011612504720687866, "kl": 0.0004976913332939148, "learning_rate": 2.7950000000000003e-06, "loss": 0.0, "num_tokens": 1484928.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 4970 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.75, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 19.75, "completions/mean_terminated_length": 19.75, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 92.05555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.06355352699756622, "kl": 0.03838350251317024, "learning_rate": 2.7944444444444447e-06, "loss": 0.0019, "num_tokens": 1485255.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4971 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 18.0, "completions/mean_terminated_length": 18.0, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 92.07407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.13037832081317902, "kl": 0.031788150779902935, "learning_rate": 2.7938888888888895e-06, "loss": 0.0017, "num_tokens": 1485555.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4972 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 28.25, "completions/clipped_ratio": 0.0, "completions/max_length": 48.0, "completions/max_terminated_length": 48.0, "completions/mean_length": 28.25, "completions/mean_terminated_length": 28.25, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 92.0925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.12419334799051285, "kl": 0.054729998111724854, "learning_rate": 2.7933333333333334e-06, "loss": 0.0023, "num_tokens": 1485888.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4973 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 92.11111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.01896822825074196, "kl": 0.000345379114151001, "learning_rate": 2.792777777777778e-06, "loss": 0.0, "num_tokens": 1486144.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 4974 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.75, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.75, "completions/mean_terminated_length": 14.75, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 92.12962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.22054152190685272, "kl": 0.0560910739004612, "learning_rate": 2.792222222222222e-06, "loss": 0.0027, "num_tokens": 1486435.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4975 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.0, "completions/clipped_ratio": 0.0, "completions/max_length": 25.0, "completions/max_terminated_length": 25.0, "completions/mean_length": 20.0, "completions/mean_terminated_length": 20.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 92.14814814814815, "frac_reward_zero_std": 0.0, "grad_norm": 3.3125839233398438, "kl": 0.4322813153266907, "learning_rate": 2.791666666666667e-06, "loss": -0.062, "num_tokens": 1486743.0, "reward": 5.75, "reward_std": 2.598076105117798, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 2.598076105117798, "step": 4976 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 92.16666666666667, "frac_reward_zero_std": 1.0, "grad_norm": 0.02058422937989235, "kl": 0.0026808008551597595, "learning_rate": 2.7911111111111113e-06, "loss": 0.0001, "num_tokens": 1486951.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 4977 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 27.5, "completions/clipped_ratio": 0.0, "completions/max_length": 53.0, "completions/max_terminated_length": 53.0, "completions/mean_length": 27.5, "completions/mean_terminated_length": 27.5, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 92.18518518518519, "frac_reward_zero_std": 0.0, "grad_norm": 3.0358188152313232, "kl": 0.0355022381991148, "learning_rate": 2.7905555555555556e-06, "loss": -0.2292, "num_tokens": 1487313.0, "reward": 7.625, "reward_std": 0.25, "rewards/reward_combined/mean": 7.625, "rewards/reward_combined/std": 0.25, "step": 4978 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 92.20370370370371, "frac_reward_zero_std": 1.0, "grad_norm": 0.22116388380527496, "kl": 0.045429942198097706, "learning_rate": 2.7900000000000004e-06, "loss": 0.0023, "num_tokens": 1487611.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4979 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 92.22222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.015524551272392273, "kl": 0.01333358883857727, "learning_rate": 2.7894444444444447e-06, "loss": 0.0007, "num_tokens": 1487923.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 4980 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 43.0, "completions/clipped_ratio": 0.0, "completions/max_length": 45.0, "completions/max_terminated_length": 45.0, "completions/mean_length": 43.0, "completions/mean_terminated_length": 43.0, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "epoch": 92.24074074074075, "frac_reward_zero_std": 1.0, "grad_norm": 0.06204724311828613, "kl": 0.06648273207247257, "learning_rate": 2.788888888888889e-06, "loss": 0.0033, "num_tokens": 1488375.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 4981 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 92.25925925925925, "frac_reward_zero_std": 1.0, "grad_norm": 0.008291205391287804, "kl": 0.004125263541936874, "learning_rate": 2.7883333333333335e-06, "loss": 0.0002, "num_tokens": 1488635.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 4982 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 92.27777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.010824425145983696, "kl": 0.004578584106639028, "learning_rate": 2.7877777777777782e-06, "loss": 0.0002, "num_tokens": 1488899.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4983 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 92.29629629629629, "frac_reward_zero_std": 1.0, "grad_norm": 0.01868986152112484, "kl": 0.03211461007595062, "learning_rate": 2.787222222222222e-06, "loss": 0.0016, "num_tokens": 1489115.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 4984 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 44.5, "completions/clipped_ratio": 0.0, "completions/max_length": 106.0, "completions/max_terminated_length": 106.0, "completions/mean_length": 44.5, "completions/mean_terminated_length": 44.5, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 92.31481481481481, "frac_reward_zero_std": 0.0, "grad_norm": 1.8572367429733276, "kl": 0.08655886352062225, "learning_rate": 2.786666666666667e-06, "loss": -0.3113, "num_tokens": 1489529.0, "reward": 7.625, "reward_std": 0.25, "rewards/reward_combined/mean": 7.625, "rewards/reward_combined/std": 0.25, "step": 4985 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.75, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 23.75, "completions/mean_terminated_length": 23.75, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 92.33333333333333, "frac_reward_zero_std": 0.0, "grad_norm": 0.8116070628166199, "kl": 0.1693599782884121, "learning_rate": 2.7861111111111113e-06, "loss": -0.0006, "num_tokens": 1489892.0, "reward": 2.875, "reward_std": 1.25, "rewards/reward_combined/mean": 2.875, "rewards/reward_combined/std": 1.25, "step": 4986 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 92.35185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.04521380364894867, "kl": 0.03534220810979605, "learning_rate": 2.7855555555555557e-06, "loss": 0.0018, "num_tokens": 1490164.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4987 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 92.37037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.014243884943425655, "kl": 0.0012076988932676613, "learning_rate": 2.7850000000000004e-06, "loss": 0.0001, "num_tokens": 1490420.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4988 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 92.38888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 9.984898497350514e-05, "kl": 7.875263690948486e-06, "learning_rate": 2.784444444444445e-06, "loss": 0.0, "num_tokens": 1490640.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 4989 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.25, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 18.25, "completions/mean_terminated_length": 18.25, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 92.4074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.05073999613523483, "kl": 0.05926734767854214, "learning_rate": 2.783888888888889e-06, "loss": 0.003, "num_tokens": 1490933.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 4990 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 92.42592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.23206155002117157, "kl": 0.15123724192380905, "learning_rate": 2.7833333333333335e-06, "loss": 0.0076, "num_tokens": 1491169.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 4991 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 28.25, "completions/clipped_ratio": 0.0, "completions/max_length": 30.0, "completions/max_terminated_length": 30.0, "completions/mean_length": 28.25, "completions/mean_terminated_length": 28.25, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 92.44444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.13339003920555115, "kl": 0.17905985563993454, "learning_rate": 2.7827777777777783e-06, "loss": 0.009, "num_tokens": 1491506.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 4992 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 5.5, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 5.5, "completions/mean_terminated_length": 5.5, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 92.46296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.028354067355394363, "kl": 0.0008943932771217078, "learning_rate": 2.7822222222222222e-06, "loss": 0.0, "num_tokens": 1491728.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 4993 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 92.48148148148148, "frac_reward_zero_std": 0.0, "grad_norm": 2.6117894649505615, "kl": 0.0016838558367453516, "learning_rate": 2.781666666666667e-06, "loss": -0.0331, "num_tokens": 1492008.0, "reward": 7.625, "reward_std": 0.25, "rewards/reward_combined/mean": 7.625, "rewards/reward_combined/std": 0.25, "step": 4994 }, { "clip_ratio/high_max": 0.011904762126505375, "clip_ratio/high_mean": 0.011904762126505375, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.011904762126505375, "completion_length": 19.5, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 19.5, "completions/mean_terminated_length": 19.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 92.5, "frac_reward_zero_std": 0.0, "grad_norm": 2.2906887531280518, "kl": 0.10448650363832712, "learning_rate": 2.781111111111111e-06, "loss": -0.027, "num_tokens": 1492338.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 4995 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 92.51851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.020351186394691467, "kl": 0.00196040648734197, "learning_rate": 2.7805555555555557e-06, "loss": 0.0001, "num_tokens": 1492658.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4996 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 92.53703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.13580074906349182, "kl": 0.01832009630743414, "learning_rate": 2.7800000000000005e-06, "loss": 0.0009, "num_tokens": 1492918.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 4997 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 92.55555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.03725472092628479, "kl": 0.001846298633608967, "learning_rate": 2.7794444444444444e-06, "loss": 0.0001, "num_tokens": 1493188.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 4998 }, { "clip_ratio/high_max": 0.008064515888690948, "clip_ratio/high_mean": 0.008064515888690948, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.008064515888690948, "completion_length": 33.25, "completions/clipped_ratio": 0.0, "completions/max_length": 40.0, "completions/max_terminated_length": 40.0, "completions/mean_length": 33.25, "completions/mean_terminated_length": 33.25, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 92.57407407407408, "frac_reward_zero_std": 0.0, "grad_norm": 1.9973695278167725, "kl": 0.1993519924581051, "learning_rate": 2.778888888888889e-06, "loss": 0.0747, "num_tokens": 1493545.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 4999 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.25, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 17.25, "completions/mean_terminated_length": 17.25, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 92.5925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.9400177597999573, "kl": 0.1721017174422741, "learning_rate": 2.7783333333333336e-06, "loss": 0.0087, "num_tokens": 1493834.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5000 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 92.61111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.022396700456738472, "kl": 0.24077923595905304, "learning_rate": 2.7777777777777783e-06, "loss": 0.012, "num_tokens": 1494134.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5001 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.25, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 20.25, "completions/mean_terminated_length": 20.25, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 92.62962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.026439689099788666, "kl": 0.008886401541531086, "learning_rate": 2.7772222222222223e-06, "loss": 0.0004, "num_tokens": 1494459.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5002 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.75, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 12.75, "completions/mean_terminated_length": 12.75, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 92.64814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.062089353799819946, "kl": 0.0051193842664361, "learning_rate": 2.776666666666667e-06, "loss": 0.0002, "num_tokens": 1494722.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5003 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.25, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.25, "completions/mean_terminated_length": 13.25, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 92.66666666666667, "frac_reward_zero_std": 1.0, "grad_norm": 0.12456871569156647, "kl": 0.02017850778065622, "learning_rate": 2.776111111111111e-06, "loss": 0.001, "num_tokens": 1495007.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5004 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 92.68518518518519, "frac_reward_zero_std": 1.0, "grad_norm": 0.0521509125828743, "kl": 0.0106089455075562, "learning_rate": 2.7755555555555558e-06, "loss": 0.0005, "num_tokens": 1495291.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5005 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 29.25, "completions/clipped_ratio": 0.0, "completions/max_length": 30.0, "completions/max_terminated_length": 30.0, "completions/mean_length": 29.25, "completions/mean_terminated_length": 29.25, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 92.70370370370371, "frac_reward_zero_std": 1.0, "grad_norm": 0.08016502112150192, "kl": 0.16016560420393944, "learning_rate": 2.7750000000000005e-06, "loss": 0.008, "num_tokens": 1495624.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5006 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.004672897048294544, "clip_ratio/low_min": 0.004672897048294544, "clip_ratio/region_mean": 0.004672897048294544, "completion_length": 37.5, "completions/clipped_ratio": 0.0, "completions/max_length": 85.0, "completions/max_terminated_length": 85.0, "completions/mean_length": 37.5, "completions/mean_terminated_length": 37.5, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 92.72222222222223, "frac_reward_zero_std": 0.0, "grad_norm": 2.6617813110351562, "kl": 0.033838531002402306, "learning_rate": 2.7744444444444445e-06, "loss": 0.2467, "num_tokens": 1496002.0, "reward": 5.75, "reward_std": 2.598076105117798, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 2.598076105117798, "step": 5007 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.25, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 18.25, "completions/mean_terminated_length": 18.25, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 92.74074074074075, "frac_reward_zero_std": 1.0, "grad_norm": 0.029486751183867455, "kl": 0.0023616646649315953, "learning_rate": 2.7738888888888892e-06, "loss": 0.0001, "num_tokens": 1496315.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5008 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 92.75925925925925, "frac_reward_zero_std": 1.0, "grad_norm": 0.030295241624116898, "kl": 0.01039809500798583, "learning_rate": 2.7733333333333336e-06, "loss": 0.0005, "num_tokens": 1496587.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5009 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 33.75, "completions/clipped_ratio": 0.0, "completions/max_length": 42.0, "completions/max_terminated_length": 42.0, "completions/mean_length": 33.75, "completions/mean_terminated_length": 33.75, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 92.77777777777777, "frac_reward_zero_std": 0.0, "grad_norm": 2.486746072769165, "kl": 0.1482328474521637, "learning_rate": 2.772777777777778e-06, "loss": 0.0061, "num_tokens": 1496950.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 5010 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.75, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 13.75, "completions/mean_terminated_length": 13.75, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 92.79629629629629, "frac_reward_zero_std": 1.0, "grad_norm": 0.019172614440321922, "kl": 0.0029136231169104576, "learning_rate": 2.7722222222222223e-06, "loss": 0.0001, "num_tokens": 1497229.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5011 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 92.81481481481481, "frac_reward_zero_std": 1.0, "grad_norm": 0.018801894038915634, "kl": 0.014061120338737965, "learning_rate": 2.771666666666667e-06, "loss": 0.0007, "num_tokens": 1497489.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5012 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.0, "completions/clipped_ratio": 0.0, "completions/max_length": 29.0, "completions/max_terminated_length": 29.0, "completions/mean_length": 25.0, "completions/mean_terminated_length": 25.0, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 92.83333333333333, "frac_reward_zero_std": 1.0, "grad_norm": 0.07402580976486206, "kl": 0.019177472218871117, "learning_rate": 2.771111111111111e-06, "loss": 0.0009, "num_tokens": 1497837.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 5013 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 92.85185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.06623672693967819, "kl": 0.021303815999999642, "learning_rate": 2.770555555555556e-06, "loss": 0.0011, "num_tokens": 1498141.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5014 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 17.0, "completions/mean_terminated_length": 17.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 92.87037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.014961686916649342, "kl": 0.1729404702782631, "learning_rate": 2.7700000000000006e-06, "loss": 0.0086, "num_tokens": 1498449.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5015 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 7.75, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 7.75, "completions/mean_terminated_length": 7.75, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 92.88888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.649931788444519, "kl": 0.08994963765144348, "learning_rate": 2.7694444444444445e-06, "loss": 0.0042, "num_tokens": 1498692.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 5016 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 92.9074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.006155845243483782, "kl": 0.00045008957386016846, "learning_rate": 2.7688888888888893e-06, "loss": 0.0, "num_tokens": 1498904.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 5017 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.5, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 17.5, "completions/mean_terminated_length": 17.5, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 92.92592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.19400769472122192, "kl": 0.029013704508543015, "learning_rate": 2.7683333333333332e-06, "loss": 0.0015, "num_tokens": 1499194.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5018 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 92.94444444444444, "frac_reward_zero_std": 0.0, "grad_norm": 5.192575454711914, "kl": 0.013375247828662395, "learning_rate": 2.767777777777778e-06, "loss": 0.084, "num_tokens": 1499490.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 5019 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 78.25, "completions/clipped_ratio": 0.25, "completions/max_length": 256.0, "completions/max_terminated_length": 31.0, "completions/mean_length": 78.25, "completions/mean_terminated_length": 19.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 92.96296296296296, "frac_reward_zero_std": 0.0, "grad_norm": 1.775316596031189, "kl": 0.052269471809268, "learning_rate": 2.7672222222222224e-06, "loss": 0.5888, "num_tokens": 1500023.0, "reward": 3.049999952316284, "reward_std": 5.499393939971924, "rewards/reward_combined/mean": 3.049999952316284, "rewards/reward_combined/std": 5.499393939971924, "step": 5020 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 7.5, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 7.5, "completions/mean_terminated_length": 7.5, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 92.98148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.08554728329181671, "kl": 0.00810362517950125, "learning_rate": 2.766666666666667e-06, "loss": 0.0005, "num_tokens": 1500261.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 5021 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 93.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.09275481104850769, "kl": 0.026056982576847076, "learning_rate": 2.766111111111111e-06, "loss": 0.0013, "num_tokens": 1500547.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5022 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.25, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 17.25, "completions/mean_terminated_length": 17.25, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 93.01851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.38333091139793396, "kl": 0.06717650964856148, "learning_rate": 2.765555555555556e-06, "loss": 0.0027, "num_tokens": 1500868.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5023 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.75, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 13.75, "completions/mean_terminated_length": 13.75, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 93.03703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.07929928600788116, "kl": 0.021975215524435043, "learning_rate": 2.7650000000000006e-06, "loss": 0.0011, "num_tokens": 1501155.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5024 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 93.05555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.016646021977066994, "kl": 0.0028654199559241533, "learning_rate": 2.7644444444444446e-06, "loss": 0.0001, "num_tokens": 1501439.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5025 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.75, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.75, "completions/mean_terminated_length": 15.75, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 93.07407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.007310925051569939, "kl": 0.0013581174425780773, "learning_rate": 2.7638888888888893e-06, "loss": 0.0001, "num_tokens": 1501762.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5026 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 28.5, "completions/clipped_ratio": 0.0, "completions/max_length": 34.0, "completions/max_terminated_length": 34.0, "completions/mean_length": 28.5, "completions/mean_terminated_length": 28.5, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 93.0925925925926, "frac_reward_zero_std": 0.0, "grad_norm": 1.2337168455123901, "kl": 0.0896470844745636, "learning_rate": 2.7633333333333333e-06, "loss": 0.0069, "num_tokens": 1502112.0, "reward": 4.375, "reward_std": 2.0966243743896484, "rewards/reward_combined/mean": 4.375, "rewards/reward_combined/std": 2.0966243743896484, "step": 5027 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.5, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.5, "completions/mean_terminated_length": 6.5, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 93.11111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.047149691730737686, "kl": 0.0039001624390948564, "learning_rate": 2.762777777777778e-06, "loss": 0.0002, "num_tokens": 1502346.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 5028 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 93.12962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.013722055591642857, "kl": 0.0038920383667573333, "learning_rate": 2.7622222222222224e-06, "loss": 0.0002, "num_tokens": 1502636.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5029 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 93.14814814814815, "frac_reward_zero_std": 0.0, "grad_norm": 1.9267874956130981, "kl": 0.035311490297317505, "learning_rate": 2.7616666666666668e-06, "loss": 0.0042, "num_tokens": 1502908.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 5030 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 93.16666666666667, "frac_reward_zero_std": 1.0, "grad_norm": 0.007265117950737476, "kl": 0.0009740889072418213, "learning_rate": 2.761111111111111e-06, "loss": 0.0, "num_tokens": 1503120.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 5031 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.25, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.25, "completions/mean_terminated_length": 13.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 93.18518518518519, "frac_reward_zero_std": 1.0, "grad_norm": 0.010357854887843132, "kl": 0.002179914270527661, "learning_rate": 2.760555555555556e-06, "loss": 0.0001, "num_tokens": 1503397.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5032 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 93.20370370370371, "frac_reward_zero_std": 1.0, "grad_norm": 0.010358144529163837, "kl": 0.0017091259360313416, "learning_rate": 2.7600000000000003e-06, "loss": 0.0001, "num_tokens": 1503641.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 5033 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.25, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 16.25, "completions/mean_terminated_length": 16.25, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 93.22222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.5297973155975342, "kl": 0.11764344945549965, "learning_rate": 2.7594444444444446e-06, "loss": 0.0068, "num_tokens": 1503946.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5034 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 93.24074074074075, "frac_reward_zero_std": 1.0, "grad_norm": 0.01556004211306572, "kl": 0.014739900827407837, "learning_rate": 2.7588888888888894e-06, "loss": 0.0007, "num_tokens": 1504206.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5035 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.0, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 21.0, "completions/mean_terminated_length": 21.0, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 93.25925925925925, "frac_reward_zero_std": 1.0, "grad_norm": 0.053984325379133224, "kl": 0.01710640825331211, "learning_rate": 2.7583333333333333e-06, "loss": 0.0009, "num_tokens": 1504542.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5036 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 93.27777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.23714937269687653, "kl": 0.04022805951535702, "learning_rate": 2.757777777777778e-06, "loss": 0.0017, "num_tokens": 1504814.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5037 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 5.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 5.0, "completions/mean_terminated_length": 5.0, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 93.29629629629629, "frac_reward_zero_std": 1.0, "grad_norm": 0.0040569850243628025, "kl": 0.0003046929923584685, "learning_rate": 2.757222222222222e-06, "loss": 0.0, "num_tokens": 1505034.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 5038 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 33.75, "completions/clipped_ratio": 0.0, "completions/max_length": 39.0, "completions/max_terminated_length": 39.0, "completions/mean_length": 33.75, "completions/mean_terminated_length": 33.75, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 93.31481481481481, "frac_reward_zero_std": 1.0, "grad_norm": 0.2013729363679886, "kl": 0.16692669689655304, "learning_rate": 2.756666666666667e-06, "loss": 0.0083, "num_tokens": 1505393.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5039 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 37.0, "completions/clipped_ratio": 0.0, "completions/max_length": 43.0, "completions/max_terminated_length": 43.0, "completions/mean_length": 37.0, "completions/mean_terminated_length": 37.0, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 93.33333333333333, "frac_reward_zero_std": 0.0, "grad_norm": 1.954748272895813, "kl": 0.0929417759180069, "learning_rate": 2.756111111111111e-06, "loss": 0.0736, "num_tokens": 1505769.0, "reward": 4.5, "reward_std": 4.041451930999756, "rewards/reward_combined/mean": 4.5, "rewards/reward_combined/std": 4.041451930999756, "step": 5040 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 45.0, "completions/clipped_ratio": 0.0, "completions/max_length": 45.0, "completions/max_terminated_length": 45.0, "completions/mean_length": 45.0, "completions/mean_terminated_length": 45.0, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "epoch": 93.35185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.029706912115216255, "kl": 0.058671239763498306, "learning_rate": 2.755555555555556e-06, "loss": 0.0029, "num_tokens": 1506229.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 5041 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.25, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 3.25, "completions/mean_terminated_length": 3.25, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 93.37037037037037, "frac_reward_zero_std": 0.0, "grad_norm": 7.92524528503418, "kl": 0.23659903556108475, "learning_rate": 2.7550000000000003e-06, "loss": -0.0464, "num_tokens": 1506438.0, "reward": 2.75, "reward_std": 1.5, "rewards/reward_combined/mean": 2.75, "rewards/reward_combined/std": 1.5, "step": 5042 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.0, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 22.0, "completions/mean_terminated_length": 22.0, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 93.38888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.17812219262123108, "kl": 0.02633110247552395, "learning_rate": 2.7544444444444447e-06, "loss": 0.0013, "num_tokens": 1506754.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5043 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 93.4074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.015828240662813187, "kl": 0.0018713559256866574, "learning_rate": 2.7538888888888894e-06, "loss": 0.0001, "num_tokens": 1507024.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5044 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.25, "completions/clipped_ratio": 0.0, "completions/max_length": 48.0, "completions/max_terminated_length": 48.0, "completions/mean_length": 23.25, "completions/mean_terminated_length": 23.25, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 93.42592592592592, "frac_reward_zero_std": 0.0, "grad_norm": 4.892978668212891, "kl": 0.034975471906363964, "learning_rate": 2.7533333333333334e-06, "loss": 0.3335, "num_tokens": 1507337.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 5045 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 93.44444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.019078928977251053, "kl": 0.0005303248763084412, "learning_rate": 2.752777777777778e-06, "loss": 0.0, "num_tokens": 1507549.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 5046 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 93.46296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.02164502441883087, "kl": 0.0043364763259887695, "learning_rate": 2.752222222222222e-06, "loss": 0.0002, "num_tokens": 1507809.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 5047 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.25, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 17.25, "completions/mean_terminated_length": 17.25, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 93.48148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.03519782796502113, "kl": 0.006422429869417101, "learning_rate": 2.751666666666667e-06, "loss": 0.0003, "num_tokens": 1508118.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5048 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 93.5, "frac_reward_zero_std": 1.0, "grad_norm": 0.00728233065456152, "kl": 0.22696757316589355, "learning_rate": 2.7511111111111112e-06, "loss": 0.0113, "num_tokens": 1508420.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5049 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 93.51851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.011318168602883816, "kl": 0.00020128190226387233, "learning_rate": 2.7505555555555556e-06, "loss": 0.0, "num_tokens": 1508676.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 5050 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.75, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 4.75, "completions/mean_terminated_length": 4.75, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 93.53703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.0922418013215065, "kl": 0.03475791960954666, "learning_rate": 2.7500000000000004e-06, "loss": 0.0018, "num_tokens": 1508895.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 5051 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 93.55555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.015184298157691956, "kl": 0.013483801856637001, "learning_rate": 2.7494444444444447e-06, "loss": 0.0007, "num_tokens": 1509207.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 5052 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.25, "completions/clipped_ratio": 0.0, "completions/max_length": 25.0, "completions/max_terminated_length": 25.0, "completions/mean_length": 20.25, "completions/mean_terminated_length": 20.25, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 93.57407407407408, "frac_reward_zero_std": 0.0, "grad_norm": 3.490389347076416, "kl": 0.06389935780316591, "learning_rate": 2.748888888888889e-06, "loss": 0.0863, "num_tokens": 1509508.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 5053 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.25, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 17.25, "completions/mean_terminated_length": 17.25, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 93.5925925925926, "frac_reward_zero_std": 0.0, "grad_norm": 2.9382667541503906, "kl": 0.0890839472413063, "learning_rate": 2.7483333333333334e-06, "loss": -0.0255, "num_tokens": 1509817.0, "reward": 5.75, "reward_std": 4.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 4.5, "step": 5054 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 93.61111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.02136409841477871, "kl": 0.015968775376677513, "learning_rate": 2.747777777777778e-06, "loss": 0.0008, "num_tokens": 1510109.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5055 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.75, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 18.75, "completions/mean_terminated_length": 18.75, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 93.62962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 1.4844086170196533, "kl": 0.18187648244202137, "learning_rate": 2.747222222222222e-06, "loss": 0.0125, "num_tokens": 1510432.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 5056 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 93.64814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.08699223399162292, "kl": 0.01288333086995408, "learning_rate": 2.746666666666667e-06, "loss": 0.0007, "num_tokens": 1510732.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5057 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.25, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 20.25, "completions/mean_terminated_length": 20.25, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 93.66666666666667, "frac_reward_zero_std": 1.0, "grad_norm": 0.07706451416015625, "kl": 0.1020544171333313, "learning_rate": 2.7461111111111113e-06, "loss": 0.0051, "num_tokens": 1511041.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5058 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 29.0, "completions/clipped_ratio": 0.0, "completions/max_length": 48.0, "completions/max_terminated_length": 48.0, "completions/mean_length": 29.0, "completions/mean_terminated_length": 29.0, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 93.68518518518519, "frac_reward_zero_std": 1.0, "grad_norm": 0.08401250094175339, "kl": 0.0472187390550971, "learning_rate": 2.7455555555555556e-06, "loss": 0.0023, "num_tokens": 1511377.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5059 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 93.70370370370371, "frac_reward_zero_std": 1.0, "grad_norm": 3.03329525195295e-05, "kl": 4.716217517852783e-06, "learning_rate": 2.7450000000000004e-06, "loss": 0.0, "num_tokens": 1511597.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 5060 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 34.25, "completions/clipped_ratio": 0.0, "completions/max_length": 38.0, "completions/max_terminated_length": 38.0, "completions/mean_length": 34.25, "completions/mean_terminated_length": 34.25, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 93.72222222222223, "frac_reward_zero_std": 0.0, "grad_norm": 1.6642102003097534, "kl": 0.043778685852885246, "learning_rate": 2.7444444444444448e-06, "loss": 0.0301, "num_tokens": 1511958.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 5061 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.0, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 24.0, "completions/mean_terminated_length": 24.0, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 93.74074074074075, "frac_reward_zero_std": 1.0, "grad_norm": 0.002377377822995186, "kl": 0.09046494215726852, "learning_rate": 2.743888888888889e-06, "loss": 0.0045, "num_tokens": 1512322.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 5062 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 93.75925925925925, "frac_reward_zero_std": 1.0, "grad_norm": 0.04796796664595604, "kl": 0.021542207337915897, "learning_rate": 2.7433333333333335e-06, "loss": 0.0011, "num_tokens": 1512590.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5063 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.5, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 21.5, "completions/mean_terminated_length": 21.5, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 93.77777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.04202129691839218, "kl": 0.008260078262537718, "learning_rate": 2.7427777777777782e-06, "loss": 0.0004, "num_tokens": 1512920.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5064 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.5, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 19.5, "completions/mean_terminated_length": 19.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 93.79629629629629, "frac_reward_zero_std": 1.0, "grad_norm": 0.026542678475379944, "kl": 0.02726731658913195, "learning_rate": 2.742222222222222e-06, "loss": 0.0013, "num_tokens": 1513246.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5065 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 93.81481481481481, "frac_reward_zero_std": 1.0, "grad_norm": 0.02056683786213398, "kl": 0.0029771873960271478, "learning_rate": 2.741666666666667e-06, "loss": 0.0001, "num_tokens": 1513506.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5066 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 93.83333333333333, "frac_reward_zero_std": 1.0, "grad_norm": 0.03831171244382858, "kl": 0.0024818044621497393, "learning_rate": 2.7411111111111117e-06, "loss": 0.0001, "num_tokens": 1513766.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5067 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 93.85185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.04587572067975998, "kl": 0.03362428583204746, "learning_rate": 2.7405555555555557e-06, "loss": 0.0017, "num_tokens": 1514040.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5068 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 93.87037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.19027413427829742, "kl": 0.02696849629865028, "learning_rate": 2.7400000000000004e-06, "loss": 0.0018, "num_tokens": 1514312.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 5069 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.25, "completions/clipped_ratio": 0.0, "completions/max_length": 25.0, "completions/max_terminated_length": 25.0, "completions/mean_length": 22.25, "completions/mean_terminated_length": 22.25, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 93.88888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.05181529000401497, "kl": 0.06741039082407951, "learning_rate": 2.7394444444444444e-06, "loss": 0.0034, "num_tokens": 1514621.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 5070 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 93.9074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.020936716347932816, "kl": 0.011241118423640728, "learning_rate": 2.738888888888889e-06, "loss": 0.0006, "num_tokens": 1514893.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5071 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 93.92592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.021629631519317627, "kl": 0.0053508952260017395, "learning_rate": 2.7383333333333335e-06, "loss": 0.0003, "num_tokens": 1515129.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 5072 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.75, "completions/clipped_ratio": 0.0, "completions/max_length": 29.0, "completions/max_terminated_length": 29.0, "completions/mean_length": 25.75, "completions/mean_terminated_length": 25.75, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 93.94444444444444, "frac_reward_zero_std": 0.0, "grad_norm": 2.221238374710083, "kl": 0.13612139225006104, "learning_rate": 2.7377777777777783e-06, "loss": 0.0576, "num_tokens": 1515448.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 5073 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 93.96296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.040944620966911316, "kl": 0.010364380665123463, "learning_rate": 2.7372222222222222e-06, "loss": 0.0005, "num_tokens": 1515720.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5074 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 93.98148148148148, "frac_reward_zero_std": 0.0, "grad_norm": 3.245666265487671, "kl": 0.03398558124899864, "learning_rate": 2.736666666666667e-06, "loss": 0.0624, "num_tokens": 1515998.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 5075 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.75, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 18.75, "completions/mean_terminated_length": 18.75, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 94.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.032271817326545715, "kl": 0.008030255790799856, "learning_rate": 2.7361111111111118e-06, "loss": 0.0004, "num_tokens": 1516301.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5076 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 94.01851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.045377448201179504, "kl": 0.008919976186007261, "learning_rate": 2.7355555555555557e-06, "loss": 0.0004, "num_tokens": 1516573.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5077 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 94.03703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.022847110405564308, "kl": 0.003971627214923501, "learning_rate": 2.7350000000000005e-06, "loss": 0.0002, "num_tokens": 1516857.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5078 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.0, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 23.0, "completions/mean_terminated_length": 23.0, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 94.05555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.05203114449977875, "kl": 0.013126606587320566, "learning_rate": 2.7344444444444444e-06, "loss": 0.0007, "num_tokens": 1517193.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5079 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 94.07407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.05017625913023949, "kl": 0.018031428568065166, "learning_rate": 2.733888888888889e-06, "loss": 0.0009, "num_tokens": 1517461.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5080 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.75, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 17.75, "completions/mean_terminated_length": 17.75, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 94.0925925925926, "frac_reward_zero_std": 0.0, "grad_norm": 4.57865571975708, "kl": 0.1412714533507824, "learning_rate": 2.7333333333333336e-06, "loss": 0.1645, "num_tokens": 1517752.0, "reward": 4.375, "reward_std": 3.902456521987915, "rewards/reward_combined/mean": 4.375, "rewards/reward_combined/std": 3.902456521987915, "step": 5081 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.25, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 18.25, "completions/mean_terminated_length": 18.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 94.11111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.9207814335823059, "kl": 0.09312665276229382, "learning_rate": 2.732777777777778e-06, "loss": 0.0047, "num_tokens": 1518045.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5082 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 94.12962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.021638095378875732, "kl": 0.000819772481918335, "learning_rate": 2.7322222222222223e-06, "loss": 0.0, "num_tokens": 1518257.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 5083 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.0, "completions/clipped_ratio": 0.0, "completions/max_length": 25.0, "completions/max_terminated_length": 25.0, "completions/mean_length": 23.0, "completions/mean_terminated_length": 23.0, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 94.14814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.06472151726484299, "kl": 0.04979990981519222, "learning_rate": 2.731666666666667e-06, "loss": 0.0026, "num_tokens": 1518569.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 5084 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 28.0, "completions/clipped_ratio": 0.0, "completions/max_length": 30.0, "completions/max_terminated_length": 30.0, "completions/mean_length": 28.0, "completions/mean_terminated_length": 28.0, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 94.16666666666667, "frac_reward_zero_std": 1.0, "grad_norm": 0.06489749252796173, "kl": 0.14892025291919708, "learning_rate": 2.7311111111111114e-06, "loss": 0.0075, "num_tokens": 1518905.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5085 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 44.25, "completions/clipped_ratio": 0.0, "completions/max_length": 50.0, "completions/max_terminated_length": 50.0, "completions/mean_length": 44.25, "completions/mean_terminated_length": 44.25, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "epoch": 94.18518518518519, "frac_reward_zero_std": 1.0, "grad_norm": 0.33348149061203003, "kl": 0.10095112398266792, "learning_rate": 2.7305555555555558e-06, "loss": 0.005, "num_tokens": 1519362.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 5086 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.5, "completions/clipped_ratio": 0.0, "completions/max_length": 28.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 25.5, "completions/mean_terminated_length": 25.5, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 94.20370370370371, "frac_reward_zero_std": 1.0, "grad_norm": 0.31241053342819214, "kl": 0.15200291946530342, "learning_rate": 2.7300000000000005e-06, "loss": 0.0077, "num_tokens": 1519700.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5087 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.75, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 17.75, "completions/mean_terminated_length": 17.75, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 94.22222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.13927938044071198, "kl": 0.023017332423478365, "learning_rate": 2.7294444444444445e-06, "loss": 0.0012, "num_tokens": 1520011.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5088 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.5, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 12.5, "completions/mean_terminated_length": 12.5, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 94.24074074074075, "frac_reward_zero_std": 1.0, "grad_norm": 0.41052716970443726, "kl": 0.087740458548069, "learning_rate": 2.7288888888888893e-06, "loss": 0.0059, "num_tokens": 1520285.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5089 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.25, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 18.25, "completions/mean_terminated_length": 18.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 94.25925925925925, "frac_reward_zero_std": 1.0, "grad_norm": 0.3697167932987213, "kl": 0.08949605375528336, "learning_rate": 2.728333333333333e-06, "loss": 0.0046, "num_tokens": 1520606.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5090 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 27.0, "completions/clipped_ratio": 0.0, "completions/max_length": 31.0, "completions/max_terminated_length": 31.0, "completions/mean_length": 27.0, "completions/mean_terminated_length": 27.0, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 94.27777777777777, "frac_reward_zero_std": 0.0, "grad_norm": 3.399670124053955, "kl": 0.0282521634362638, "learning_rate": 2.727777777777778e-06, "loss": -0.0063, "num_tokens": 1520962.0, "reward": 3.125, "reward_std": 1.75, "rewards/reward_combined/mean": 3.125, "rewards/reward_combined/std": 1.75, "step": 5091 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 94.29629629629629, "frac_reward_zero_std": 1.0, "grad_norm": 0.03988178074359894, "kl": 0.009581414517015219, "learning_rate": 2.7272222222222223e-06, "loss": 0.0005, "num_tokens": 1521252.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5092 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 94.31481481481481, "frac_reward_zero_std": 1.0, "grad_norm": 0.007986979559063911, "kl": 0.0005770095449406654, "learning_rate": 2.726666666666667e-06, "loss": 0.0, "num_tokens": 1521548.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5093 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 94.33333333333333, "frac_reward_zero_std": 1.0, "grad_norm": 0.03735242038965225, "kl": 0.0024706152180442587, "learning_rate": 2.7261111111111115e-06, "loss": 0.0001, "num_tokens": 1521826.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5094 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.25, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 22.25, "completions/mean_terminated_length": 22.25, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 94.35185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.04565398022532463, "kl": 0.014393480494618416, "learning_rate": 2.725555555555556e-06, "loss": 0.0007, "num_tokens": 1522143.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5095 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 94.37037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.010704404674470425, "kl": 0.0014253308763727546, "learning_rate": 2.7250000000000006e-06, "loss": 0.0001, "num_tokens": 1522465.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5096 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.5, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.5, "completions/mean_terminated_length": 6.5, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 94.38888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.05044044181704521, "kl": 0.004799169284524396, "learning_rate": 2.7244444444444445e-06, "loss": 0.0002, "num_tokens": 1522699.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 5097 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.25, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 19.25, "completions/mean_terminated_length": 19.25, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 94.4074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.030061841011047363, "kl": 0.005711067235097289, "learning_rate": 2.7238888888888893e-06, "loss": 0.0003, "num_tokens": 1523004.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5098 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.75, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 4.75, "completions/mean_terminated_length": 4.75, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 94.42592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.02074606530368328, "kl": 0.0007308632193598896, "learning_rate": 2.7233333333333332e-06, "loss": 0.0, "num_tokens": 1523223.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 5099 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 12.5, "completions/mean_terminated_length": 12.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 94.44444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.03621511161327362, "kl": 0.020793859846889973, "learning_rate": 2.722777777777778e-06, "loss": 0.001, "num_tokens": 1523497.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5100 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 26.0, "completions/clipped_ratio": 0.0, "completions/max_length": 29.0, "completions/max_terminated_length": 29.0, "completions/mean_length": 26.0, "completions/mean_terminated_length": 26.0, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 94.46296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.07461759448051453, "kl": 0.12281503155827522, "learning_rate": 2.7222222222222224e-06, "loss": 0.0062, "num_tokens": 1523817.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5101 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.25, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 8.25, "completions/mean_terminated_length": 8.25, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 94.48148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.1177162155508995, "kl": 0.024592367932200432, "learning_rate": 2.7216666666666667e-06, "loss": 0.0013, "num_tokens": 1524062.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 5102 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.0, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 20.0, "completions/mean_terminated_length": 20.0, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 94.5, "frac_reward_zero_std": 1.0, "grad_norm": 0.30617576837539673, "kl": 0.03608645871281624, "learning_rate": 2.7211111111111115e-06, "loss": 0.0018, "num_tokens": 1524394.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5103 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.5, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 12.5, "completions/mean_terminated_length": 12.5, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 94.51851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.041960205882787704, "kl": 0.0022832758259028196, "learning_rate": 2.720555555555556e-06, "loss": 0.0001, "num_tokens": 1524656.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5104 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.0, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 24.0, "completions/mean_terminated_length": 24.0, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 94.53703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.002175509463995695, "kl": 0.09051816537976265, "learning_rate": 2.7200000000000002e-06, "loss": 0.0045, "num_tokens": 1525020.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 5105 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 94.55555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.001294418703764677, "kl": 0.0017289647366851568, "learning_rate": 2.7194444444444446e-06, "loss": 0.0001, "num_tokens": 1525300.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5106 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 94.57407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.000547032686881721, "kl": 0.21367286890745163, "learning_rate": 2.7188888888888894e-06, "loss": 0.0107, "num_tokens": 1525604.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5107 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 36.25, "completions/clipped_ratio": 0.0, "completions/max_length": 39.0, "completions/max_terminated_length": 39.0, "completions/mean_length": 36.25, "completions/mean_terminated_length": 36.25, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 94.5925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.14200136065483093, "kl": 0.09847941249608994, "learning_rate": 2.7183333333333333e-06, "loss": 0.0049, "num_tokens": 1525977.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5108 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.5, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 17.5, "completions/mean_terminated_length": 17.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 94.61111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.09945845603942871, "kl": 0.14622347801923752, "learning_rate": 2.717777777777778e-06, "loss": 0.0073, "num_tokens": 1526287.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5109 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 94.62962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.054994549602270126, "kl": 0.003649118007160723, "learning_rate": 2.717222222222222e-06, "loss": 0.0002, "num_tokens": 1526555.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5110 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 94.64814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.00574670871719718, "kl": 0.0004409164102980867, "learning_rate": 2.7166666666666668e-06, "loss": 0.0, "num_tokens": 1526815.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 5111 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.25, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.25, "completions/mean_terminated_length": 14.25, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 94.66666666666667, "frac_reward_zero_std": 1.0, "grad_norm": 1.0546749830245972, "kl": 0.13970889896154404, "learning_rate": 2.7161111111111116e-06, "loss": 0.0071, "num_tokens": 1527088.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5112 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.25, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 19.25, "completions/mean_terminated_length": 19.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 94.68518518518519, "frac_reward_zero_std": 0.0, "grad_norm": 5.071977615356445, "kl": 0.29183967038989067, "learning_rate": 2.715555555555556e-06, "loss": -0.0023, "num_tokens": 1527393.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 5113 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 38.75, "completions/clipped_ratio": 0.0, "completions/max_length": 53.0, "completions/max_terminated_length": 53.0, "completions/mean_length": 38.75, "completions/mean_terminated_length": 38.75, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 94.70370370370371, "frac_reward_zero_std": 1.0, "grad_norm": 0.06518138200044632, "kl": 0.027697831392288208, "learning_rate": 2.7150000000000003e-06, "loss": 0.0015, "num_tokens": 1527772.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5114 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.25, "completions/clipped_ratio": 0.0, "completions/max_length": 30.0, "completions/max_terminated_length": 30.0, "completions/mean_length": 25.25, "completions/mean_terminated_length": 25.25, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 94.72222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.059660378843545914, "kl": 0.029819983057677746, "learning_rate": 2.7144444444444446e-06, "loss": 0.0015, "num_tokens": 1528125.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5115 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.25, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 17.25, "completions/mean_terminated_length": 17.25, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 94.74074074074075, "frac_reward_zero_std": 1.0, "grad_norm": 0.032391563057899475, "kl": 0.005994449253194034, "learning_rate": 2.7138888888888894e-06, "loss": 0.0003, "num_tokens": 1528434.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5116 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 26.25, "completions/clipped_ratio": 0.0, "completions/max_length": 51.0, "completions/max_terminated_length": 51.0, "completions/mean_length": 26.25, "completions/mean_terminated_length": 26.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 94.75925925925925, "frac_reward_zero_std": 1.0, "grad_norm": 0.091433584690094, "kl": 0.020775836892426014, "learning_rate": 2.7133333333333333e-06, "loss": 0.001, "num_tokens": 1528759.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5117 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 94.77777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.022147785872220993, "kl": 0.013070298358798027, "learning_rate": 2.712777777777778e-06, "loss": 0.0007, "num_tokens": 1529019.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5118 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.5, "completions/clipped_ratio": 0.0, "completions/max_length": 30.0, "completions/max_terminated_length": 30.0, "completions/mean_length": 17.5, "completions/mean_terminated_length": 17.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 94.79629629629629, "frac_reward_zero_std": 1.0, "grad_norm": 0.07871592044830322, "kl": 0.027066207490861416, "learning_rate": 2.712222222222222e-06, "loss": 0.0014, "num_tokens": 1529309.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5119 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 94.81481481481481, "frac_reward_zero_std": 1.0, "grad_norm": 0.1978510320186615, "kl": 0.015465840697288513, "learning_rate": 2.711666666666667e-06, "loss": 0.001, "num_tokens": 1529517.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 5120 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 94.83333333333333, "frac_reward_zero_std": 1.0, "grad_norm": 0.04907754063606262, "kl": 0.0063516744412481785, "learning_rate": 2.7111111111111116e-06, "loss": 0.0003, "num_tokens": 1529789.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5121 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 94.85185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.0014504193095490336, "kl": 0.00016583799879299477, "learning_rate": 2.7105555555555555e-06, "loss": 0.0, "num_tokens": 1530045.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 5122 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 94.87037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 5.8545821957523e-05, "kl": 5.938112735748291e-06, "learning_rate": 2.7100000000000003e-06, "loss": 0.0, "num_tokens": 1530265.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 5123 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.5, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 4.5, "completions/mean_terminated_length": 4.5, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 94.88888888888889, "frac_reward_zero_std": 0.0, "grad_norm": 15.22216796875, "kl": 0.04695988819003105, "learning_rate": 2.7094444444444447e-06, "loss": 0.1112, "num_tokens": 1530503.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 5124 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.5, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 6.5, "completions/mean_terminated_length": 6.5, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 94.9074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.24055816233158112, "kl": 0.06025035306811333, "learning_rate": 2.708888888888889e-06, "loss": 0.0038, "num_tokens": 1530729.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 5125 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 94.92592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.007017161231487989, "kl": 0.0008329898118972778, "learning_rate": 2.7083333333333334e-06, "loss": 0.0, "num_tokens": 1530941.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 5126 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 94.94444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.017186185345053673, "kl": 0.012737659737467766, "learning_rate": 2.707777777777778e-06, "loss": 0.0006, "num_tokens": 1531253.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 5127 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.25, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.25, "completions/mean_terminated_length": 14.25, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 94.96296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.6202879548072815, "kl": 0.08613291848450899, "learning_rate": 2.707222222222222e-06, "loss": 0.0042, "num_tokens": 1531542.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5128 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 94.98148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.004584162496030331, "kl": 0.004159983247518539, "learning_rate": 2.706666666666667e-06, "loss": 0.0002, "num_tokens": 1531802.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 5129 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 95.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.020835664123296738, "kl": 0.016078890301287174, "learning_rate": 2.7061111111111116e-06, "loss": 0.0008, "num_tokens": 1532094.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5130 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.25, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 10.25, "completions/mean_terminated_length": 10.25, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 95.01851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.01049867458641529, "kl": 0.00029773928690701723, "learning_rate": 2.7055555555555556e-06, "loss": 0.0, "num_tokens": 1532351.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 5131 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.5, "completions/clipped_ratio": 0.0, "completions/max_length": 34.0, "completions/max_terminated_length": 34.0, "completions/mean_length": 19.5, "completions/mean_terminated_length": 19.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 95.03703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.4211808145046234, "kl": 0.08587116748094559, "learning_rate": 2.7050000000000004e-06, "loss": 0.0047, "num_tokens": 1532649.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5132 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 95.05555555555556, "frac_reward_zero_std": 0.0, "grad_norm": 4.140800476074219, "kl": 0.024846727959811687, "learning_rate": 2.7044444444444447e-06, "loss": 0.0448, "num_tokens": 1532929.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 5133 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 18.0, "completions/mean_terminated_length": 18.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 95.07407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.008310263976454735, "kl": 0.0019868214149028063, "learning_rate": 2.703888888888889e-06, "loss": 0.0001, "num_tokens": 1533241.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5134 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 95.0925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.011715807020664215, "kl": 0.0006526976794702932, "learning_rate": 2.7033333333333334e-06, "loss": 0.0, "num_tokens": 1533497.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5135 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 95.11111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.02240549400448799, "kl": 0.012891971040517092, "learning_rate": 2.702777777777778e-06, "loss": 0.0006, "num_tokens": 1533757.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5136 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.25, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 14.25, "completions/mean_terminated_length": 14.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 95.12962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.10754149407148361, "kl": 0.012707278598099947, "learning_rate": 2.702222222222222e-06, "loss": 0.0007, "num_tokens": 1534026.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5137 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 95.14814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.010410990566015244, "kl": 0.24053358286619186, "learning_rate": 2.701666666666667e-06, "loss": 0.0119, "num_tokens": 1534326.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5138 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.5, "completions/clipped_ratio": 0.0, "completions/max_length": 37.0, "completions/max_terminated_length": 37.0, "completions/mean_length": 25.5, "completions/mean_terminated_length": 25.5, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 95.16666666666667, "frac_reward_zero_std": 1.0, "grad_norm": 0.12330359220504761, "kl": 0.06307770684361458, "learning_rate": 2.7011111111111117e-06, "loss": 0.003, "num_tokens": 1534648.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5139 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 95.18518518518519, "frac_reward_zero_std": 1.0, "grad_norm": 0.0056348820216953754, "kl": 0.0009960159659385681, "learning_rate": 2.7005555555555556e-06, "loss": 0.0, "num_tokens": 1534892.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 5140 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.5, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 2.5, "completions/mean_terminated_length": 2.5, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 95.20370370370371, "frac_reward_zero_std": 1.0, "grad_norm": 0.05467912554740906, "kl": 0.004003889858722687, "learning_rate": 2.7000000000000004e-06, "loss": 0.0003, "num_tokens": 1535106.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 5141 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.5, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 2.5, "completions/mean_terminated_length": 2.5, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 95.22222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.06453946977853775, "kl": 0.003629423677921295, "learning_rate": 2.6994444444444443e-06, "loss": 0.0002, "num_tokens": 1535312.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 5142 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 95.24074074074075, "frac_reward_zero_std": 1.0, "grad_norm": 0.005162880755960941, "kl": 0.00045668672828469425, "learning_rate": 2.698888888888889e-06, "loss": 0.0, "num_tokens": 1535608.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5143 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.25, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.25, "completions/mean_terminated_length": 14.25, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 95.25925925925925, "frac_reward_zero_std": 1.0, "grad_norm": 0.023905083537101746, "kl": 0.004891094344202429, "learning_rate": 2.6983333333333335e-06, "loss": 0.0003, "num_tokens": 1535897.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5144 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.25, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 14.25, "completions/mean_terminated_length": 14.25, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 95.27777777777777, "frac_reward_zero_std": 0.0, "grad_norm": 2.4196465015411377, "kl": 0.02401343476958573, "learning_rate": 2.6977777777777783e-06, "loss": 0.129, "num_tokens": 1536178.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 5145 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.25, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 21.25, "completions/mean_terminated_length": 21.25, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 95.29629629629629, "frac_reward_zero_std": 0.0, "grad_norm": 0.6888519525527954, "kl": 0.03912303131073713, "learning_rate": 2.697222222222222e-06, "loss": 0.0004, "num_tokens": 1536511.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 5146 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.25, "completions/clipped_ratio": 0.0, "completions/max_length": 36.0, "completions/max_terminated_length": 36.0, "completions/mean_length": 25.25, "completions/mean_terminated_length": 25.25, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 95.31481481481481, "frac_reward_zero_std": 1.0, "grad_norm": 0.305654376745224, "kl": 0.08826220035552979, "learning_rate": 2.696666666666667e-06, "loss": 0.0046, "num_tokens": 1536832.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 5147 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.25, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 17.25, "completions/mean_terminated_length": 17.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 95.33333333333333, "frac_reward_zero_std": 1.0, "grad_norm": 0.03177953511476517, "kl": 0.17166122794151306, "learning_rate": 2.6961111111111117e-06, "loss": 0.0086, "num_tokens": 1537141.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5148 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 95.35185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.0053429570980370045, "kl": 0.004011530429124832, "learning_rate": 2.6955555555555557e-06, "loss": 0.0002, "num_tokens": 1537401.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 5149 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 7.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 7.0, "completions/mean_terminated_length": 7.0, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 95.37037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.003548734588548541, "kl": 0.0003013014793395996, "learning_rate": 2.6950000000000005e-06, "loss": 0.0, "num_tokens": 1537637.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 5150 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.75, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 11.75, "completions/mean_terminated_length": 11.75, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 95.38888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.14899392426013947, "kl": 0.04979510884732008, "learning_rate": 2.6944444444444444e-06, "loss": 0.0027, "num_tokens": 1537908.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5151 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.5, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 19.5, "completions/mean_terminated_length": 19.5, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 95.4074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.10415305942296982, "kl": 0.026614608243107796, "learning_rate": 2.693888888888889e-06, "loss": 0.0013, "num_tokens": 1538238.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5152 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.25, "completions/clipped_ratio": 0.0, "completions/max_length": 34.0, "completions/max_terminated_length": 34.0, "completions/mean_length": 22.25, "completions/mean_terminated_length": 22.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 95.42592592592592, "frac_reward_zero_std": 0.0, "grad_norm": 2.5177643299102783, "kl": 0.03529911860823631, "learning_rate": 2.6933333333333335e-06, "loss": -0.1606, "num_tokens": 1538579.0, "reward": 7.625, "reward_std": 0.25, "rewards/reward_combined/mean": 7.625, "rewards/reward_combined/std": 0.25, "step": 5153 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 30.25, "completions/clipped_ratio": 0.0, "completions/max_length": 34.0, "completions/max_terminated_length": 34.0, "completions/mean_length": 30.25, "completions/mean_terminated_length": 30.25, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 95.44444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.03357991948723793, "kl": 0.08984877914190292, "learning_rate": 2.692777777777778e-06, "loss": 0.0045, "num_tokens": 1538928.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5154 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 95.46296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.008041935041546822, "kl": 0.0003709718585014343, "learning_rate": 2.6922222222222222e-06, "loss": 0.0, "num_tokens": 1539140.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 5155 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.25, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 22.25, "completions/mean_terminated_length": 22.25, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 95.48148148148148, "frac_reward_zero_std": 0.0, "grad_norm": 3.0399370193481445, "kl": 0.04030406381934881, "learning_rate": 2.691666666666667e-06, "loss": -0.0286, "num_tokens": 1539457.0, "reward": 5.125, "reward_std": 3.4731109142303467, "rewards/reward_combined/mean": 5.125, "rewards/reward_combined/std": 3.4731109142303467, "step": 5156 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.5, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 16.5, "completions/mean_terminated_length": 16.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 95.5, "frac_reward_zero_std": 1.0, "grad_norm": 0.023601077497005463, "kl": 0.005449533462524414, "learning_rate": 2.6911111111111114e-06, "loss": 0.0003, "num_tokens": 1539751.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5157 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 95.51851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.012285791337490082, "kl": 0.0084146186709404, "learning_rate": 2.6905555555555557e-06, "loss": 0.0004, "num_tokens": 1539987.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 5158 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 95.53703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.0010140719823539257, "kl": 0.0017046757857315242, "learning_rate": 2.6900000000000005e-06, "loss": 0.0001, "num_tokens": 1540267.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5159 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 95.55555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.025273457169532776, "kl": 0.012261521071195602, "learning_rate": 2.6894444444444444e-06, "loss": 0.0006, "num_tokens": 1540579.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 5160 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.25, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 15.25, "completions/mean_terminated_length": 15.25, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 95.57407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.09776614606380463, "kl": 0.03166163992136717, "learning_rate": 2.6888888888888892e-06, "loss": 0.0016, "num_tokens": 1540880.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5161 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 36.25, "completions/clipped_ratio": 0.0, "completions/max_length": 59.0, "completions/max_terminated_length": 59.0, "completions/mean_length": 36.25, "completions/mean_terminated_length": 36.25, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 95.5925925925926, "frac_reward_zero_std": 0.0, "grad_norm": 4.313704490661621, "kl": 0.1536504477262497, "learning_rate": 2.688333333333333e-06, "loss": 0.1968, "num_tokens": 1541249.0, "reward": 5.75, "reward_std": 4.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 4.5, "step": 5162 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 30.5, "completions/clipped_ratio": 0.0, "completions/max_length": 35.0, "completions/max_terminated_length": 35.0, "completions/mean_length": 30.5, "completions/mean_terminated_length": 30.5, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 95.61111111111111, "frac_reward_zero_std": 0.0, "grad_norm": 1.7991347312927246, "kl": 0.2554139159619808, "learning_rate": 2.687777777777778e-06, "loss": -0.0185, "num_tokens": 1541587.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 5163 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 18.0, "completions/mean_terminated_length": 18.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 95.62962962962963, "frac_reward_zero_std": 0.0, "grad_norm": 2.513956069946289, "kl": 0.09345235675573349, "learning_rate": 2.6872222222222223e-06, "loss": 0.0582, "num_tokens": 1541879.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 5164 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 27.75, "completions/clipped_ratio": 0.0, "completions/max_length": 32.0, "completions/max_terminated_length": 32.0, "completions/mean_length": 27.75, "completions/mean_terminated_length": 27.75, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 95.64814814814815, "frac_reward_zero_std": 0.0, "grad_norm": 3.094595432281494, "kl": 0.11050845310091972, "learning_rate": 2.686666666666667e-06, "loss": 0.0391, "num_tokens": 1542226.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 5165 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 7.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 7.0, "completions/mean_terminated_length": 7.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 95.66666666666667, "frac_reward_zero_std": 1.0, "grad_norm": 0.0859253853559494, "kl": 0.007360442075878382, "learning_rate": 2.6861111111111114e-06, "loss": 0.0002, "num_tokens": 1542474.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 5166 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.25, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.25, "completions/mean_terminated_length": 11.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 95.68518518518519, "frac_reward_zero_std": 1.0, "grad_norm": 0.05476067215204239, "kl": 0.009593709837645292, "learning_rate": 2.6855555555555558e-06, "loss": 0.0005, "num_tokens": 1542747.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5167 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.5, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 4.5, "completions/mean_terminated_length": 4.5, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "epoch": 95.70370370370371, "frac_reward_zero_std": 0.0, "grad_norm": 11.16102409362793, "kl": 0.03197764232754707, "learning_rate": 2.6850000000000006e-06, "loss": -0.156, "num_tokens": 1542965.0, "reward": 3.875, "reward_std": 0.25, "rewards/reward_combined/mean": 3.875, "rewards/reward_combined/std": 0.25, "step": 5168 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.5, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 12.5, "completions/mean_terminated_length": 12.5, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 95.72222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.03461683541536331, "kl": 0.0014493025373667479, "learning_rate": 2.6844444444444445e-06, "loss": 0.0001, "num_tokens": 1543239.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5169 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.75, "completions/clipped_ratio": 0.0, "completions/max_length": 39.0, "completions/max_terminated_length": 39.0, "completions/mean_length": 24.75, "completions/mean_terminated_length": 24.75, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 95.74074074074075, "frac_reward_zero_std": 1.0, "grad_norm": 0.29944658279418945, "kl": 0.046230765990912914, "learning_rate": 2.6838888888888893e-06, "loss": 0.0026, "num_tokens": 1543586.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 5170 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.0, "completions/clipped_ratio": 0.0, "completions/max_length": 27.0, "completions/max_terminated_length": 27.0, "completions/mean_length": 23.0, "completions/mean_terminated_length": 23.0, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 95.75925925925925, "frac_reward_zero_std": 1.0, "grad_norm": 0.0635107234120369, "kl": 0.09330756589770317, "learning_rate": 2.683333333333333e-06, "loss": 0.0047, "num_tokens": 1543906.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5171 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 12.5, "completions/mean_terminated_length": 12.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 95.77777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.023097379133105278, "kl": 0.004328008275479078, "learning_rate": 2.682777777777778e-06, "loss": 0.0002, "num_tokens": 1544188.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5172 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 95.79629629629629, "frac_reward_zero_std": 1.0, "grad_norm": 5.310420601745136e-05, "kl": 5.416572093963623e-06, "learning_rate": 2.6822222222222223e-06, "loss": 0.0, "num_tokens": 1544408.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 5173 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.75, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.75, "completions/mean_terminated_length": 14.75, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 95.81481481481481, "frac_reward_zero_std": 0.0, "grad_norm": 2.9326765537261963, "kl": 0.03781689237803221, "learning_rate": 2.6816666666666667e-06, "loss": -0.036, "num_tokens": 1544683.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 5174 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.5, "completions/clipped_ratio": 0.0, "completions/max_length": 25.0, "completions/max_terminated_length": 25.0, "completions/mean_length": 22.5, "completions/mean_terminated_length": 22.5, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 95.83333333333333, "frac_reward_zero_std": 1.0, "grad_norm": 0.7380534410476685, "kl": 0.09791964944452047, "learning_rate": 2.6811111111111115e-06, "loss": 0.005, "num_tokens": 1545017.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5175 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.25, "completions/clipped_ratio": 0.0, "completions/max_length": 25.0, "completions/max_terminated_length": 25.0, "completions/mean_length": 24.25, "completions/mean_terminated_length": 24.25, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 95.85185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.07715742290019989, "kl": 0.09614670276641846, "learning_rate": 2.680555555555556e-06, "loss": 0.0048, "num_tokens": 1545382.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 5176 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 38.25, "completions/clipped_ratio": 0.0, "completions/max_length": 46.0, "completions/max_terminated_length": 46.0, "completions/mean_length": 38.25, "completions/mean_terminated_length": 38.25, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 95.87037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.03134707733988762, "kl": 0.04120118170976639, "learning_rate": 2.68e-06, "loss": 0.0021, "num_tokens": 1545759.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5177 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 95.88888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.009286812506616116, "kl": 0.0012515562120825052, "learning_rate": 2.6794444444444445e-06, "loss": 0.0001, "num_tokens": 1546083.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5178 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 95.9074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.0203386377543211, "kl": 0.011651622597128153, "learning_rate": 2.6788888888888893e-06, "loss": 0.0006, "num_tokens": 1546355.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5179 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.75, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 13.75, "completions/mean_terminated_length": 13.75, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 95.92592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.040718406438827515, "kl": 0.013050522422417998, "learning_rate": 2.6783333333333332e-06, "loss": 0.0007, "num_tokens": 1546642.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5180 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 95.94444444444444, "frac_reward_zero_std": 0.0, "grad_norm": 10.368769645690918, "kl": 1.5806638412177563, "learning_rate": 2.677777777777778e-06, "loss": 0.4349, "num_tokens": 1546882.0, "reward": 3.125, "reward_std": 1.75, "rewards/reward_combined/mean": 3.125, "rewards/reward_combined/std": 1.75, "step": 5181 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 95.96296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.026551157236099243, "kl": 0.01567722111940384, "learning_rate": 2.6772222222222224e-06, "loss": 0.0008, "num_tokens": 1547176.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5182 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 95.98148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.07476551830768585, "kl": 0.008131411857903004, "learning_rate": 2.6766666666666667e-06, "loss": 0.0004, "num_tokens": 1547448.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5183 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 45.5, "completions/clipped_ratio": 0.0, "completions/max_length": 49.0, "completions/max_terminated_length": 49.0, "completions/mean_length": 45.5, "completions/mean_terminated_length": 45.5, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 96.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.048939306288957596, "kl": 0.05383583344519138, "learning_rate": 2.6761111111111115e-06, "loss": 0.0027, "num_tokens": 1547910.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 5184 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 96.01851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.012817786075174809, "kl": 0.25326110422611237, "learning_rate": 2.675555555555556e-06, "loss": 0.0126, "num_tokens": 1548208.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5185 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.5, "completions/clipped_ratio": 0.0, "completions/max_length": 31.0, "completions/max_terminated_length": 31.0, "completions/mean_length": 22.5, "completions/mean_terminated_length": 22.5, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 96.03703703703704, "frac_reward_zero_std": 0.0, "grad_norm": 1.9252077341079712, "kl": 0.10701891779899597, "learning_rate": 2.6750000000000002e-06, "loss": -0.0345, "num_tokens": 1548526.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 5186 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.25, "completions/clipped_ratio": 0.0, "completions/max_length": 36.0, "completions/max_terminated_length": 36.0, "completions/mean_length": 32.25, "completions/mean_terminated_length": 32.25, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 96.05555555555556, "frac_reward_zero_std": 0.0, "grad_norm": 1.8944149017333984, "kl": 0.18013645708560944, "learning_rate": 2.6744444444444446e-06, "loss": -0.0005, "num_tokens": 1548883.0, "reward": 6.75, "reward_std": 2.1794495582580566, "rewards/reward_combined/mean": 6.75, "rewards/reward_combined/std": 2.1794495582580566, "step": 5187 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 9.5, "completions/mean_terminated_length": 9.5, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 96.07407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.015605159103870392, "kl": 0.002008420415222645, "learning_rate": 2.6738888888888894e-06, "loss": 0.0001, "num_tokens": 1549141.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5188 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.75, "completions/clipped_ratio": 0.0, "completions/max_length": 48.0, "completions/max_terminated_length": 48.0, "completions/mean_length": 32.75, "completions/mean_terminated_length": 32.75, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 96.0925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.05524091795086861, "kl": 0.05911489576101303, "learning_rate": 2.6733333333333333e-06, "loss": 0.0029, "num_tokens": 1549492.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5189 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 26.5, "completions/clipped_ratio": 0.0, "completions/max_length": 30.0, "completions/max_terminated_length": 30.0, "completions/mean_length": 26.5, "completions/mean_terminated_length": 26.5, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 96.11111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.3644173741340637, "kl": 0.14909441024065018, "learning_rate": 2.672777777777778e-06, "loss": 0.0076, "num_tokens": 1549834.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5190 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 96.12962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.02624943107366562, "kl": 0.002978052361868322, "learning_rate": 2.672222222222223e-06, "loss": 0.0001, "num_tokens": 1550102.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5191 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 96.14814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.01302744448184967, "kl": 0.008082307875156403, "learning_rate": 2.6716666666666668e-06, "loss": 0.0004, "num_tokens": 1550338.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 5192 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.75, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 18.75, "completions/mean_terminated_length": 18.75, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 96.16666666666667, "frac_reward_zero_std": 0.0, "grad_norm": 3.099194049835205, "kl": 0.007626082049682736, "learning_rate": 2.6711111111111116e-06, "loss": -0.0682, "num_tokens": 1550641.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 5193 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.5, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.5, "completions/mean_terminated_length": 6.5, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 96.18518518518519, "frac_reward_zero_std": 1.0, "grad_norm": 0.057060305029153824, "kl": 0.0063942670822143555, "learning_rate": 2.6705555555555555e-06, "loss": 0.0003, "num_tokens": 1550875.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 5194 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 96.20370370370371, "frac_reward_zero_std": 1.0, "grad_norm": 2.8585731342900544e-05, "kl": 4.656612873077393e-06, "learning_rate": 2.6700000000000003e-06, "loss": 0.0, "num_tokens": 1551095.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 5195 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 18.0, "completions/mean_terminated_length": 18.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 96.22222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.21543686091899872, "kl": 0.05179929547011852, "learning_rate": 2.6694444444444446e-06, "loss": 0.0026, "num_tokens": 1551387.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5196 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.0, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 19.0, "completions/mean_terminated_length": 19.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 96.24074074074075, "frac_reward_zero_std": 0.0, "grad_norm": 6.069647789001465, "kl": 0.04662082064896822, "learning_rate": 2.6688888888888894e-06, "loss": -0.0476, "num_tokens": 1551715.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 5197 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.0, "completions/clipped_ratio": 0.0, "completions/max_length": 29.0, "completions/max_terminated_length": 29.0, "completions/mean_length": 23.0, "completions/mean_terminated_length": 23.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 96.25925925925925, "frac_reward_zero_std": 1.0, "grad_norm": 0.16034391522407532, "kl": 0.08528724312782288, "learning_rate": 2.6683333333333333e-06, "loss": 0.0043, "num_tokens": 1552027.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 5198 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.75, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 13.75, "completions/mean_terminated_length": 13.75, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 96.27777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 1.5359152555465698, "kl": 0.3774709105491638, "learning_rate": 2.667777777777778e-06, "loss": 0.0208, "num_tokens": 1552314.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5199 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 96.29629629629629, "frac_reward_zero_std": 1.0, "grad_norm": 2.3686630725860596, "kl": 0.13754202332347631, "learning_rate": 2.667222222222223e-06, "loss": 0.0072, "num_tokens": 1552588.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5200 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 96.31481481481481, "frac_reward_zero_std": 1.0, "grad_norm": 0.006596114486455917, "kl": 0.0003142167115584016, "learning_rate": 2.666666666666667e-06, "loss": 0.0, "num_tokens": 1552884.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5201 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 96.33333333333333, "frac_reward_zero_std": 1.0, "grad_norm": 0.015656165778636932, "kl": 0.00023699402299826033, "learning_rate": 2.6661111111111116e-06, "loss": 0.0, "num_tokens": 1553140.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 5202 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.0, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 22.0, "completions/mean_terminated_length": 22.0, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 96.35185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.03217431157827377, "kl": 0.010335217695683241, "learning_rate": 2.6655555555555555e-06, "loss": 0.0005, "num_tokens": 1553456.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5203 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.75, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.75, "completions/mean_terminated_length": 14.75, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 96.37037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.16530029475688934, "kl": 0.045491454657167196, "learning_rate": 2.6650000000000003e-06, "loss": 0.0022, "num_tokens": 1553747.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5204 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.5, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 3.5, "completions/mean_terminated_length": 3.5, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 96.38888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.07952563464641571, "kl": 0.007910534739494324, "learning_rate": 2.6644444444444447e-06, "loss": 0.0004, "num_tokens": 1553957.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 5205 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.010416666977107525, "clip_ratio/low_min": 0.010416666977107525, "clip_ratio/region_mean": 0.010416666977107525, "completion_length": 21.75, "completions/clipped_ratio": 0.0, "completions/max_length": 38.0, "completions/max_terminated_length": 38.0, "completions/mean_length": 21.75, "completions/mean_terminated_length": 21.75, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 96.4074074074074, "frac_reward_zero_std": 0.0, "grad_norm": 5.464784145355225, "kl": 0.057731520384550095, "learning_rate": 2.663888888888889e-06, "loss": 0.3446, "num_tokens": 1554264.0, "reward": 5.875, "reward_std": 2.136000871658325, "rewards/reward_combined/mean": 5.875, "rewards/reward_combined/std": 2.136000871658325, "step": 5206 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 43.25, "completions/clipped_ratio": 0.0, "completions/max_length": 79.0, "completions/max_terminated_length": 79.0, "completions/mean_length": 43.25, "completions/mean_terminated_length": 43.25, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 96.42592592592592, "frac_reward_zero_std": 0.0, "grad_norm": 2.960318088531494, "kl": 0.11376459896564484, "learning_rate": 2.6633333333333334e-06, "loss": 0.2814, "num_tokens": 1554661.0, "reward": 4.0, "reward_std": 4.690415859222412, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 4.690415859222412, "step": 5207 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 96.44444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.007667398080229759, "kl": 0.0005407056887634099, "learning_rate": 2.662777777777778e-06, "loss": 0.0, "num_tokens": 1554949.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5208 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.25, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.25, "completions/mean_terminated_length": 14.25, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 96.46296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.32768693566322327, "kl": 0.08253926411271095, "learning_rate": 2.6622222222222225e-06, "loss": 0.0039, "num_tokens": 1555222.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5209 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 96.48148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.010443628765642643, "kl": 0.0018765454879030585, "learning_rate": 2.661666666666667e-06, "loss": 0.0001, "num_tokens": 1555494.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5210 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 96.5, "frac_reward_zero_std": 1.0, "grad_norm": 0.2353602796792984, "kl": 0.06559664942324162, "learning_rate": 2.6611111111111117e-06, "loss": 0.0031, "num_tokens": 1555788.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5211 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.25, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 17.25, "completions/mean_terminated_length": 17.25, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 96.51851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.03004058077931404, "kl": 0.004487856858759187, "learning_rate": 2.6605555555555556e-06, "loss": 0.0002, "num_tokens": 1556097.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5212 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 27.0, "completions/clipped_ratio": 0.0, "completions/max_length": 30.0, "completions/max_terminated_length": 30.0, "completions/mean_length": 27.0, "completions/mean_terminated_length": 27.0, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 96.53703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.0576256662607193, "kl": 0.11848078668117523, "learning_rate": 2.6600000000000004e-06, "loss": 0.0059, "num_tokens": 1556421.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5213 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.0, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 24.0, "completions/mean_terminated_length": 24.0, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 96.55555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.0030246558599174023, "kl": 0.09019438177347183, "learning_rate": 2.6594444444444443e-06, "loss": 0.0045, "num_tokens": 1556785.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 5214 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 96.57407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.0023792185820639133, "kl": 8.046627044677734e-05, "learning_rate": 2.658888888888889e-06, "loss": 0.0, "num_tokens": 1556997.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 5215 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 96.5925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.020393269136548042, "kl": 0.004494961351156235, "learning_rate": 2.6583333333333334e-06, "loss": 0.0002, "num_tokens": 1557257.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 5216 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 34.5, "completions/clipped_ratio": 0.0, "completions/max_length": 38.0, "completions/max_terminated_length": 38.0, "completions/mean_length": 34.5, "completions/mean_terminated_length": 34.5, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 96.61111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.024603432044386864, "kl": 0.037120357155799866, "learning_rate": 2.6577777777777782e-06, "loss": 0.0019, "num_tokens": 1557619.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5217 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 96.62962962962963, "frac_reward_zero_std": 0.0, "grad_norm": 3.2159523963928223, "kl": 0.03665372170507908, "learning_rate": 2.6572222222222226e-06, "loss": 0.0609, "num_tokens": 1557903.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 5218 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.0, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 22.0, "completions/mean_terminated_length": 22.0, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 96.64814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.01847653277218342, "kl": 0.012469840236008167, "learning_rate": 2.656666666666667e-06, "loss": 0.0006, "num_tokens": 1558235.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5219 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.25, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.25, "completions/mean_terminated_length": 15.25, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 96.66666666666667, "frac_reward_zero_std": 1.0, "grad_norm": 0.006091555114835501, "kl": 0.0011536746169440448, "learning_rate": 2.6561111111111117e-06, "loss": 0.0001, "num_tokens": 1558556.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5220 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 96.68518518518519, "frac_reward_zero_std": 1.0, "grad_norm": 0.032279122620821, "kl": 0.006264382274821401, "learning_rate": 2.6555555555555556e-06, "loss": 0.0003, "num_tokens": 1558838.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5221 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 96.70370370370371, "frac_reward_zero_std": 1.0, "grad_norm": 0.02717607468366623, "kl": 0.00863631209358573, "learning_rate": 2.6550000000000004e-06, "loss": 0.0004, "num_tokens": 1559106.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5222 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 96.72222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.041243914514780045, "kl": 0.00860420148819685, "learning_rate": 2.6544444444444443e-06, "loss": 0.0005, "num_tokens": 1559374.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5223 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.75, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 20.75, "completions/mean_terminated_length": 20.75, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 96.74074074074075, "frac_reward_zero_std": 0.0, "grad_norm": 1.451021671295166, "kl": 0.02928977645933628, "learning_rate": 2.653888888888889e-06, "loss": -0.0374, "num_tokens": 1559705.0, "reward": 6.75, "reward_std": 1.5, "rewards/reward_combined/mean": 6.75, "rewards/reward_combined/std": 1.5, "step": 5224 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.75, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 6.75, "completions/mean_terminated_length": 6.75, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 96.75925925925925, "frac_reward_zero_std": 0.0, "grad_norm": 6.289809226989746, "kl": 0.09212920255959034, "learning_rate": 2.6533333333333335e-06, "loss": 0.1752, "num_tokens": 1559932.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 5225 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 96.77777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.009704016149044037, "kl": 0.0012093819677829742, "learning_rate": 2.652777777777778e-06, "loss": 0.0001, "num_tokens": 1560176.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 5226 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 96.79629629629629, "frac_reward_zero_std": 1.0, "grad_norm": 0.00407030014321208, "kl": 0.00014483928680419922, "learning_rate": 2.6522222222222226e-06, "loss": 0.0, "num_tokens": 1560388.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 5227 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.0, "completions/clipped_ratio": 0.0, "completions/max_length": 31.0, "completions/max_terminated_length": 31.0, "completions/mean_length": 25.0, "completions/mean_terminated_length": 25.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 96.81481481481481, "frac_reward_zero_std": 0.0, "grad_norm": 3.241506814956665, "kl": 0.648064979352057, "learning_rate": 2.651666666666667e-06, "loss": 0.0328, "num_tokens": 1560736.0, "reward": 2.5, "reward_std": 1.7320507764816284, "rewards/reward_combined/mean": 2.5, "rewards/reward_combined/std": 1.7320507764816284, "step": 5228 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 5.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 5.0, "completions/mean_terminated_length": 5.0, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 96.83333333333333, "frac_reward_zero_std": 1.0, "grad_norm": 0.004747048020362854, "kl": 0.00019425153732299805, "learning_rate": 2.6511111111111113e-06, "loss": 0.0, "num_tokens": 1560956.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 5229 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.0, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 20.0, "completions/mean_terminated_length": 20.0, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 96.85185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.053162846714258194, "kl": 0.013943901285529137, "learning_rate": 2.6505555555555557e-06, "loss": 0.0007, "num_tokens": 1561288.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5230 }, { "clip_ratio/high_max": 0.012195121496915817, "clip_ratio/high_mean": 0.012195121496915817, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.012195121496915817, "completion_length": 21.0, "completions/clipped_ratio": 0.0, "completions/max_length": 30.0, "completions/max_terminated_length": 30.0, "completions/mean_length": 21.0, "completions/mean_terminated_length": 21.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 96.87037037037037, "frac_reward_zero_std": 0.0, "grad_norm": 3.2178096771240234, "kl": 0.07577953487634659, "learning_rate": 2.6500000000000005e-06, "loss": 0.1798, "num_tokens": 1561592.0, "reward": 6.0, "reward_std": 3.34165620803833, "rewards/reward_combined/mean": 6.0, "rewards/reward_combined/std": 3.34165620803833, "step": 5231 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 96.88888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.008831063285470009, "kl": 0.0005673936102539301, "learning_rate": 2.6494444444444444e-06, "loss": 0.0, "num_tokens": 1561872.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5232 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 7.75, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 7.75, "completions/mean_terminated_length": 7.75, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 96.9074074074074, "frac_reward_zero_std": 0.0, "grad_norm": 12.16086483001709, "kl": 0.5042058825492859, "learning_rate": 2.648888888888889e-06, "loss": 0.1685, "num_tokens": 1562123.0, "reward": 2.75, "reward_std": 1.5, "rewards/reward_combined/mean": 2.75, "rewards/reward_combined/std": 1.5, "step": 5233 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.25, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 9.25, "completions/mean_terminated_length": 9.25, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 96.92592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.05867487192153931, "kl": 0.02436648216098547, "learning_rate": 2.648333333333333e-06, "loss": 0.0013, "num_tokens": 1562384.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5234 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 96.94444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.02800583466887474, "kl": 0.03133091703057289, "learning_rate": 2.647777777777778e-06, "loss": 0.0016, "num_tokens": 1562676.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5235 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 48.0, "completions/clipped_ratio": 0.0, "completions/max_length": 55.0, "completions/max_terminated_length": 55.0, "completions/mean_length": 48.0, "completions/mean_terminated_length": 48.0, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 96.96296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.04634995013475418, "kl": 0.05088967829942703, "learning_rate": 2.6472222222222227e-06, "loss": 0.0026, "num_tokens": 1563148.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 5236 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 96.98148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.01834704540669918, "kl": 0.011761944741010666, "learning_rate": 2.646666666666667e-06, "loss": 0.0006, "num_tokens": 1563460.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 5237 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.5, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 17.5, "completions/mean_terminated_length": 17.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 97.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.0950411856174469, "kl": 0.12692714855074883, "learning_rate": 2.6461111111111114e-06, "loss": 0.0063, "num_tokens": 1563770.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5238 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.25, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.25, "completions/mean_terminated_length": 14.25, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 97.01851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.21450300514698029, "kl": 0.033794681541621685, "learning_rate": 2.6455555555555557e-06, "loss": 0.0017, "num_tokens": 1564059.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5239 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.75, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 13.75, "completions/mean_terminated_length": 13.75, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 97.03703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.14093554019927979, "kl": 0.023287316784262657, "learning_rate": 2.6450000000000005e-06, "loss": 0.001, "num_tokens": 1564326.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5240 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.25, "completions/clipped_ratio": 0.0, "completions/max_length": 27.0, "completions/max_terminated_length": 27.0, "completions/mean_length": 21.25, "completions/mean_terminated_length": 21.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 97.05555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.05571784824132919, "kl": 0.08174257725477219, "learning_rate": 2.6444444444444444e-06, "loss": 0.0041, "num_tokens": 1564639.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5241 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 97.07407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.010931800119578838, "kl": 0.030269593000411987, "learning_rate": 2.6438888888888892e-06, "loss": 0.0015, "num_tokens": 1564855.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 5242 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.5, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 21.5, "completions/mean_terminated_length": 21.5, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 97.0925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.041235025972127914, "kl": 0.011523023247718811, "learning_rate": 2.643333333333333e-06, "loss": 0.0006, "num_tokens": 1565193.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5243 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 97.11111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.02844271808862686, "kl": 0.030960746109485626, "learning_rate": 2.642777777777778e-06, "loss": 0.0016, "num_tokens": 1565485.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5244 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 48.75, "completions/clipped_ratio": 0.0, "completions/max_length": 69.0, "completions/max_terminated_length": 69.0, "completions/mean_length": 48.75, "completions/mean_terminated_length": 48.75, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 97.12962962962963, "frac_reward_zero_std": 0.0, "grad_norm": 2.48405122756958, "kl": 0.05236276611685753, "learning_rate": 2.6422222222222227e-06, "loss": 0.0876, "num_tokens": 1565904.0, "reward": 3.0, "reward_std": 5.446711540222168, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 5.446711540222168, "step": 5245 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 97.14814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.048367127776145935, "kl": 0.06876371055841446, "learning_rate": 2.6416666666666666e-06, "loss": 0.0036, "num_tokens": 1566182.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5246 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 34.5, "completions/clipped_ratio": 0.0, "completions/max_length": 46.0, "completions/max_terminated_length": 46.0, "completions/mean_length": 34.5, "completions/mean_terminated_length": 34.5, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 97.16666666666667, "frac_reward_zero_std": 0.0, "grad_norm": 2.7172064781188965, "kl": 0.15306413918733597, "learning_rate": 2.6411111111111114e-06, "loss": 0.1448, "num_tokens": 1566536.0, "reward": 5.125, "reward_std": 3.4731109142303467, "rewards/reward_combined/mean": 5.125, "rewards/reward_combined/std": 3.4731109142303467, "step": 5247 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.5, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 18.5, "completions/mean_terminated_length": 18.5, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 97.18518518518519, "frac_reward_zero_std": 1.0, "grad_norm": 0.08599454909563065, "kl": 0.03295975923538208, "learning_rate": 2.6405555555555558e-06, "loss": 0.0017, "num_tokens": 1566830.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5248 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.25, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.25, "completions/mean_terminated_length": 15.25, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 97.20370370370371, "frac_reward_zero_std": 1.0, "grad_norm": 0.09488624334335327, "kl": 0.027911939658224583, "learning_rate": 2.64e-06, "loss": 0.0014, "num_tokens": 1567123.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5249 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 77.75, "completions/clipped_ratio": 0.25, "completions/max_length": 256.0, "completions/max_terminated_length": 33.0, "completions/mean_length": 77.75, "completions/mean_terminated_length": 18.33333396911621, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 97.22222222222223, "frac_reward_zero_std": 0.0, "grad_norm": 2.060269832611084, "kl": 0.02901238016784191, "learning_rate": 2.6394444444444445e-06, "loss": 0.4653, "num_tokens": 1567654.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 5250 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 97.24074074074075, "frac_reward_zero_std": 1.0, "grad_norm": 0.00012934870028402656, "kl": 6.6086649894714355e-06, "learning_rate": 2.6388888888888893e-06, "loss": 0.0, "num_tokens": 1567874.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 5251 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 97.25925925925925, "frac_reward_zero_std": 1.0, "grad_norm": 0.033636726438999176, "kl": 0.00873418990522623, "learning_rate": 2.638333333333333e-06, "loss": 0.0004, "num_tokens": 1568134.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5252 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.5, "completions/clipped_ratio": 0.0, "completions/max_length": 29.0, "completions/max_terminated_length": 29.0, "completions/mean_length": 22.5, "completions/mean_terminated_length": 22.5, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 97.27777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.09931565076112747, "kl": 0.07424584776163101, "learning_rate": 2.637777777777778e-06, "loss": 0.0035, "num_tokens": 1568444.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 5253 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.75, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 19.75, "completions/mean_terminated_length": 19.75, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 97.29629629629629, "frac_reward_zero_std": 1.0, "grad_norm": 0.02473601885139942, "kl": 0.023942294530570507, "learning_rate": 2.6372222222222228e-06, "loss": 0.0012, "num_tokens": 1568771.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5254 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.5, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 97.31481481481481, "frac_reward_zero_std": 1.0, "grad_norm": 0.2105756253004074, "kl": 0.0398885328322649, "learning_rate": 2.6366666666666667e-06, "loss": 0.0024, "num_tokens": 1569053.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5255 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 97.33333333333333, "frac_reward_zero_std": 1.0, "grad_norm": 0.03682131692767143, "kl": 0.007574969269626308, "learning_rate": 2.6361111111111115e-06, "loss": 0.0004, "num_tokens": 1569351.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5256 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.5, "completions/clipped_ratio": 0.0, "completions/max_length": 32.0, "completions/max_terminated_length": 32.0, "completions/mean_length": 24.5, "completions/mean_terminated_length": 24.5, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 97.35185185185185, "frac_reward_zero_std": 0.0, "grad_norm": 3.7801291942596436, "kl": 0.06260751653462648, "learning_rate": 2.635555555555556e-06, "loss": 0.0914, "num_tokens": 1569677.0, "reward": 6.125, "reward_std": 3.75, "rewards/reward_combined/mean": 6.125, "rewards/reward_combined/std": 3.75, "step": 5257 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 97.37037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.013431240804493427, "kl": 0.004170015454292297, "learning_rate": 2.635e-06, "loss": 0.0002, "num_tokens": 1569885.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 5258 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 46.0, "completions/clipped_ratio": 0.0, "completions/max_length": 103.0, "completions/max_terminated_length": 103.0, "completions/mean_length": 46.0, "completions/mean_terminated_length": 46.0, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 97.38888888888889, "frac_reward_zero_std": 0.0, "grad_norm": 1.7680702209472656, "kl": 0.22968391329050064, "learning_rate": 2.6344444444444445e-06, "loss": 0.2786, "num_tokens": 1570305.0, "reward": 5.175000190734863, "reward_std": 2.6874709129333496, "rewards/reward_combined/mean": 5.175000190734863, "rewards/reward_combined/std": 2.6874709129333496, "step": 5259 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 97.4074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.0035163206048309803, "kl": 0.00011877715587615967, "learning_rate": 2.6338888888888893e-06, "loss": 0.0, "num_tokens": 1570517.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 5260 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 34.25, "completions/clipped_ratio": 0.0, "completions/max_length": 35.0, "completions/max_terminated_length": 35.0, "completions/mean_length": 34.25, "completions/mean_terminated_length": 34.25, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 97.42592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.05352067947387695, "kl": 0.07958431169390678, "learning_rate": 2.6333333333333332e-06, "loss": 0.004, "num_tokens": 1570882.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5261 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 7.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 7.0, "completions/mean_terminated_length": 7.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 97.44444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.13071097433567047, "kl": 0.009309619665145874, "learning_rate": 2.632777777777778e-06, "loss": 0.0003, "num_tokens": 1571130.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 5262 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.25, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.25, "completions/mean_terminated_length": 15.25, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 97.46296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.0016317830886691809, "kl": 0.0010010182741098106, "learning_rate": 2.632222222222223e-06, "loss": 0.0, "num_tokens": 1571451.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5263 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.5, "completions/clipped_ratio": 0.0, "completions/max_length": 37.0, "completions/max_terminated_length": 37.0, "completions/mean_length": 24.5, "completions/mean_terminated_length": 24.5, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 97.48148148148148, "frac_reward_zero_std": 0.0, "grad_norm": 2.851686477661133, "kl": 0.19517676532268524, "learning_rate": 2.6316666666666667e-06, "loss": 0.0151, "num_tokens": 1571769.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 5264 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 97.5, "frac_reward_zero_std": 0.0, "grad_norm": 5.115662097930908, "kl": 0.32100851833820343, "learning_rate": 2.6311111111111115e-06, "loss": -0.0105, "num_tokens": 1572067.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 5265 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.0, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 21.0, "completions/mean_terminated_length": 21.0, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 97.51851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.03663913905620575, "kl": 0.007837854791432619, "learning_rate": 2.6305555555555555e-06, "loss": 0.0004, "num_tokens": 1572395.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5266 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 97.53703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.05995328724384308, "kl": 0.015248036361299455, "learning_rate": 2.6300000000000002e-06, "loss": 0.0008, "num_tokens": 1572693.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5267 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.25, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 17.25, "completions/mean_terminated_length": 17.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 97.55555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.0856560543179512, "kl": 0.1735595464706421, "learning_rate": 2.6294444444444446e-06, "loss": 0.0087, "num_tokens": 1573002.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5268 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 97.57407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.010068508796393871, "kl": 0.0013144314289093018, "learning_rate": 2.6288888888888894e-06, "loss": 0.0001, "num_tokens": 1573246.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 5269 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 97.5925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.02298484742641449, "kl": 0.017185616306960583, "learning_rate": 2.6283333333333333e-06, "loss": 0.0009, "num_tokens": 1573522.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5270 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 97.61111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.10731223225593567, "kl": 0.012737320503219962, "learning_rate": 2.627777777777778e-06, "loss": 0.0006, "num_tokens": 1573806.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5271 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.5, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 24.5, "completions/mean_terminated_length": 24.5, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 97.62962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.4175550639629364, "kl": 0.1220523901283741, "learning_rate": 2.627222222222223e-06, "loss": 0.0062, "num_tokens": 1574172.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 5272 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 97.64814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.1519065648317337, "kl": 0.019244064576923847, "learning_rate": 2.6266666666666668e-06, "loss": 0.0009, "num_tokens": 1574442.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5273 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 97.66666666666667, "frac_reward_zero_std": 1.0, "grad_norm": 0.012791144661605358, "kl": 0.00792289525270462, "learning_rate": 2.6261111111111116e-06, "loss": 0.0004, "num_tokens": 1574678.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 5274 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 97.68518518518519, "frac_reward_zero_std": 1.0, "grad_norm": 0.03611094877123833, "kl": 0.00405448826495558, "learning_rate": 2.6255555555555555e-06, "loss": 0.0002, "num_tokens": 1574942.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5275 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.0, "completions/clipped_ratio": 0.0, "completions/max_length": 27.0, "completions/max_terminated_length": 27.0, "completions/mean_length": 25.0, "completions/mean_terminated_length": 25.0, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 97.70370370370371, "frac_reward_zero_std": 1.0, "grad_norm": 0.24484200775623322, "kl": 0.0513051338493824, "learning_rate": 2.6250000000000003e-06, "loss": 0.0025, "num_tokens": 1575290.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 5276 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.75, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 14.75, "completions/mean_terminated_length": 14.75, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 97.72222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.12071605771780014, "kl": 0.04621961526572704, "learning_rate": 2.6244444444444446e-06, "loss": 0.0023, "num_tokens": 1575565.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5277 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 97.74074074074075, "frac_reward_zero_std": 1.0, "grad_norm": 0.02188616245985031, "kl": 0.004320114850997925, "learning_rate": 2.623888888888889e-06, "loss": 0.0002, "num_tokens": 1575825.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 5278 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.75, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.75, "completions/mean_terminated_length": 11.75, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 97.75925925925925, "frac_reward_zero_std": 1.0, "grad_norm": 0.024227354675531387, "kl": 0.008266508113592863, "learning_rate": 2.6233333333333333e-06, "loss": 0.0004, "num_tokens": 1576100.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5279 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 97.77777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.01913817599415779, "kl": 0.011511143296957016, "learning_rate": 2.622777777777778e-06, "loss": 0.0006, "num_tokens": 1576412.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 5280 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 18.0, "completions/mean_terminated_length": 18.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 97.79629629629629, "frac_reward_zero_std": 1.0, "grad_norm": 0.0010421725455671549, "kl": 0.00037610859726555645, "learning_rate": 2.6222222222222225e-06, "loss": 0.0, "num_tokens": 1576724.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5281 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 27.25, "completions/clipped_ratio": 0.0, "completions/max_length": 28.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 27.25, "completions/mean_terminated_length": 27.25, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 97.81481481481481, "frac_reward_zero_std": 0.0, "grad_norm": 3.1010489463806152, "kl": 0.19975921511650085, "learning_rate": 2.621666666666667e-06, "loss": 0.0011, "num_tokens": 1577057.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 5282 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 97.83333333333333, "frac_reward_zero_std": 1.0, "grad_norm": 0.004265529569238424, "kl": 0.00010394304990768433, "learning_rate": 2.6211111111111116e-06, "loss": 0.0, "num_tokens": 1577269.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 5283 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.5, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.5, "completions/mean_terminated_length": 6.5, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 97.85185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.03010665439069271, "kl": 0.0017651618254603818, "learning_rate": 2.6205555555555555e-06, "loss": 0.0001, "num_tokens": 1577503.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 5284 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.75, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 14.75, "completions/mean_terminated_length": 14.75, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 97.87037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.015599505975842476, "kl": 0.004013729252619669, "learning_rate": 2.6200000000000003e-06, "loss": 0.0002, "num_tokens": 1577790.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5285 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 42.0, "completions/clipped_ratio": 0.0, "completions/max_length": 45.0, "completions/max_terminated_length": 45.0, "completions/mean_length": 42.0, "completions/mean_terminated_length": 42.0, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 97.88888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.06686630845069885, "kl": 0.060893481597304344, "learning_rate": 2.6194444444444443e-06, "loss": 0.003, "num_tokens": 1578238.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 5286 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 97.9074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.05716417729854584, "kl": 0.025527273770421743, "learning_rate": 2.618888888888889e-06, "loss": 0.0015, "num_tokens": 1578526.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5287 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.0, "completions/clipped_ratio": 0.0, "completions/max_length": 31.0, "completions/max_terminated_length": 31.0, "completions/mean_length": 22.0, "completions/mean_terminated_length": 22.0, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 97.92592592592592, "frac_reward_zero_std": 0.0, "grad_norm": 2.6688497066497803, "kl": 0.04853660613298416, "learning_rate": 2.6183333333333334e-06, "loss": 0.1219, "num_tokens": 1578866.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 5288 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.25, "completions/clipped_ratio": 0.0, "completions/max_length": 28.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 16.25, "completions/mean_terminated_length": 16.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 97.94444444444444, "frac_reward_zero_std": 0.0, "grad_norm": 3.8505613803863525, "kl": 0.05192996095865965, "learning_rate": 2.617777777777778e-06, "loss": 0.1835, "num_tokens": 1579155.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 5289 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 97.96296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.014293773099780083, "kl": 0.0010752060334198177, "learning_rate": 2.6172222222222225e-06, "loss": 0.0001, "num_tokens": 1579431.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5290 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 7.75, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 7.75, "completions/mean_terminated_length": 7.75, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 97.98148148148148, "frac_reward_zero_std": 0.0, "grad_norm": 7.284572601318359, "kl": 0.011709637197782286, "learning_rate": 2.616666666666667e-06, "loss": 0.259, "num_tokens": 1579662.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 5291 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 98.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.003976719919592142, "kl": 0.0002015858917729929, "learning_rate": 2.6161111111111117e-06, "loss": 0.0, "num_tokens": 1579918.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 5292 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 98.01851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.04077788442373276, "kl": 0.009240259358193725, "learning_rate": 2.6155555555555556e-06, "loss": 0.0004, "num_tokens": 1580220.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5293 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 45.5, "completions/clipped_ratio": 0.0, "completions/max_length": 51.0, "completions/max_terminated_length": 51.0, "completions/mean_length": 45.5, "completions/mean_terminated_length": 45.5, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "epoch": 98.03703703703704, "frac_reward_zero_std": 0.0, "grad_norm": 1.0094765424728394, "kl": 0.11245035380125046, "learning_rate": 2.6150000000000004e-06, "loss": -0.0493, "num_tokens": 1580682.0, "reward": 2.375, "reward_std": 1.25, "rewards/reward_combined/mean": 2.375, "rewards/reward_combined/std": 1.25, "step": 5294 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.0, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 22.0, "completions/mean_terminated_length": 22.0, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 98.05555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.1711111217737198, "kl": 0.03863963345065713, "learning_rate": 2.6144444444444443e-06, "loss": 0.0021, "num_tokens": 1581014.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5295 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 26.5, "completions/clipped_ratio": 0.0, "completions/max_length": 29.0, "completions/max_terminated_length": 29.0, "completions/mean_length": 26.5, "completions/mean_terminated_length": 26.5, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 98.07407407407408, "frac_reward_zero_std": 0.0, "grad_norm": 2.1298463344573975, "kl": 0.10672101378440857, "learning_rate": 2.613888888888889e-06, "loss": 0.0163, "num_tokens": 1581356.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 5296 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 98.0925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.11039961874485016, "kl": 0.019885845482349396, "learning_rate": 2.6133333333333334e-06, "loss": 0.0012, "num_tokens": 1581632.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5297 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.0, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 24.0, "completions/mean_terminated_length": 24.0, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 98.11111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.005173861049115658, "kl": 0.0894763134419918, "learning_rate": 2.612777777777778e-06, "loss": 0.0045, "num_tokens": 1581996.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 5298 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 98.12962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.028543241322040558, "kl": 0.03729970660060644, "learning_rate": 2.6122222222222226e-06, "loss": 0.002, "num_tokens": 1582268.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5299 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.25, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.25, "completions/mean_terminated_length": 15.25, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 98.14814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.010320565663278103, "kl": 0.0013067677500657737, "learning_rate": 2.611666666666667e-06, "loss": 0.0001, "num_tokens": 1582589.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5300 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 98.16666666666667, "frac_reward_zero_std": 1.0, "grad_norm": 0.06600894778966904, "kl": 0.017950857058167458, "learning_rate": 2.6111111111111113e-06, "loss": 0.0009, "num_tokens": 1582885.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5301 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.75, "completions/clipped_ratio": 0.0, "completions/max_length": 27.0, "completions/max_terminated_length": 27.0, "completions/mean_length": 22.75, "completions/mean_terminated_length": 22.75, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 98.18518518518519, "frac_reward_zero_std": 1.0, "grad_norm": 0.07802323251962662, "kl": 0.021476924885064363, "learning_rate": 2.6105555555555556e-06, "loss": 0.0011, "num_tokens": 1583224.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 5302 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 98.20370370370371, "frac_reward_zero_std": 1.0, "grad_norm": 0.6024091243743896, "kl": 0.10682373214513063, "learning_rate": 2.6100000000000004e-06, "loss": 0.0046, "num_tokens": 1583512.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5303 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 6.0, "completions/mean_terminated_length": 6.0, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 98.22222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.07286975532770157, "kl": 0.003019238356500864, "learning_rate": 2.6094444444444444e-06, "loss": 0.0002, "num_tokens": 1583744.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 5304 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.75, "completions/clipped_ratio": 0.0, "completions/max_length": 41.0, "completions/max_terminated_length": 41.0, "completions/mean_length": 17.75, "completions/mean_terminated_length": 17.75, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 98.24074074074075, "frac_reward_zero_std": 1.0, "grad_norm": 0.10570815950632095, "kl": 0.013178466440876946, "learning_rate": 2.608888888888889e-06, "loss": 0.001, "num_tokens": 1584035.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 5305 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 27.25, "completions/clipped_ratio": 0.0, "completions/max_length": 38.0, "completions/max_terminated_length": 38.0, "completions/mean_length": 27.25, "completions/mean_terminated_length": 27.25, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 98.25925925925925, "frac_reward_zero_std": 0.0, "grad_norm": 2.3147151470184326, "kl": 0.07111096754670143, "learning_rate": 2.608333333333333e-06, "loss": -0.0611, "num_tokens": 1584364.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 5306 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 98.27777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.037129297852516174, "kl": 0.005789139308035374, "learning_rate": 2.607777777777778e-06, "loss": 0.0003, "num_tokens": 1584654.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5307 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 5.75, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 5.75, "completions/mean_terminated_length": 5.75, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 98.29629629629629, "frac_reward_zero_std": 0.0, "grad_norm": 6.286019325256348, "kl": 0.2919119670987129, "learning_rate": 2.6072222222222226e-06, "loss": 0.2425, "num_tokens": 1584877.0, "reward": 3.125, "reward_std": 1.75, "rewards/reward_combined/mean": 3.125, "rewards/reward_combined/std": 1.75, "step": 5308 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 98.31481481481481, "frac_reward_zero_std": 1.0, "grad_norm": 0.10126867890357971, "kl": 0.0038553805788978934, "learning_rate": 2.606666666666667e-06, "loss": 0.0002, "num_tokens": 1585153.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5309 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 98.33333333333333, "frac_reward_zero_std": 1.0, "grad_norm": 0.00010086777911055833, "kl": 5.751848220825195e-06, "learning_rate": 2.6061111111111113e-06, "loss": 0.0, "num_tokens": 1585373.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 5310 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.5, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 19.5, "completions/mean_terminated_length": 19.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 98.35185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.3963290750980377, "kl": 0.06541671603918076, "learning_rate": 2.6055555555555557e-06, "loss": 0.0034, "num_tokens": 1585671.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5311 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 36.25, "completions/clipped_ratio": 0.0, "completions/max_length": 43.0, "completions/max_terminated_length": 43.0, "completions/mean_length": 36.25, "completions/mean_terminated_length": 36.25, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 98.37037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.10054922848939896, "kl": 0.05922534316778183, "learning_rate": 2.6050000000000005e-06, "loss": 0.003, "num_tokens": 1586040.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5312 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.5, "completions/clipped_ratio": 0.0, "completions/max_length": 25.0, "completions/max_terminated_length": 25.0, "completions/mean_length": 19.5, "completions/mean_terminated_length": 19.5, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 98.38888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.1037568598985672, "kl": 0.07468705624341965, "learning_rate": 2.6044444444444444e-06, "loss": 0.0037, "num_tokens": 1586338.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 5313 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 98.4074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.0801042690873146, "kl": 0.006144379731267691, "learning_rate": 2.603888888888889e-06, "loss": 0.0005, "num_tokens": 1586554.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 5314 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.25, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 17.25, "completions/mean_terminated_length": 17.25, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 98.42592592592592, "frac_reward_zero_std": 0.0, "grad_norm": 2.188340902328491, "kl": 0.055379705503582954, "learning_rate": 2.603333333333334e-06, "loss": -0.0054, "num_tokens": 1586843.0, "reward": 7.625, "reward_std": 0.25, "rewards/reward_combined/mean": 7.625, "rewards/reward_combined/std": 0.25, "step": 5315 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.25, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 17.25, "completions/mean_terminated_length": 17.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 98.44444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.01909267157316208, "kl": 0.14057591930031776, "learning_rate": 2.602777777777778e-06, "loss": 0.007, "num_tokens": 1587152.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5316 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 76.75, "completions/clipped_ratio": 0.25, "completions/max_length": 256.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 76.75, "completions/mean_terminated_length": 17.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 98.46296296296296, "frac_reward_zero_std": 0.0, "grad_norm": 1.9783883094787598, "kl": 0.02379649318754673, "learning_rate": 2.6022222222222227e-06, "loss": 0.433, "num_tokens": 1587679.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 5317 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.75, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 18.75, "completions/mean_terminated_length": 18.75, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 98.48148148148148, "frac_reward_zero_std": 0.0, "grad_norm": 2.903017520904541, "kl": 0.14277822524309158, "learning_rate": 2.6016666666666666e-06, "loss": -0.0851, "num_tokens": 1587982.0, "reward": 4.0, "reward_std": 2.915475845336914, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 2.915475845336914, "step": 5318 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 98.5, "frac_reward_zero_std": 1.0, "grad_norm": 0.7071515321731567, "kl": 0.12765125557780266, "learning_rate": 2.6011111111111114e-06, "loss": 0.0064, "num_tokens": 1588242.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 5319 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.25, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.25, "completions/mean_terminated_length": 14.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 98.51851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.056630007922649384, "kl": 0.008183573372662067, "learning_rate": 2.6005555555555557e-06, "loss": 0.0004, "num_tokens": 1588523.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5320 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 98.53703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.16346348822116852, "kl": 0.2643040716648102, "learning_rate": 2.6e-06, "loss": 0.0132, "num_tokens": 1588821.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5321 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.25, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 4.25, "completions/mean_terminated_length": 4.25, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 98.55555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.03304412588477135, "kl": 0.0010534331668168306, "learning_rate": 2.5994444444444444e-06, "loss": 0.0001, "num_tokens": 1589034.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 5322 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.0, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 22.0, "completions/mean_terminated_length": 22.0, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 98.57407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.07798813283443451, "kl": 0.012708049267530441, "learning_rate": 2.5988888888888892e-06, "loss": 0.0006, "num_tokens": 1589350.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5323 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 98.5925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.04830321669578552, "kl": 0.006812514737248421, "learning_rate": 2.598333333333334e-06, "loss": 0.0004, "num_tokens": 1589614.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5324 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.5, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 18.5, "completions/mean_terminated_length": 18.5, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 98.61111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.00163252220954746, "kl": 0.0006187149556353688, "learning_rate": 2.597777777777778e-06, "loss": 0.0, "num_tokens": 1589928.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5325 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 98.62962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.026182256639003754, "kl": 0.016654090955853462, "learning_rate": 2.5972222222222227e-06, "loss": 0.0008, "num_tokens": 1590204.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5326 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 98.64814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.004372335039079189, "kl": 0.00013599395606433973, "learning_rate": 2.5966666666666667e-06, "loss": 0.0, "num_tokens": 1590460.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 5327 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 98.66666666666667, "frac_reward_zero_std": 1.0, "grad_norm": 0.008100011385977268, "kl": 0.00912737101316452, "learning_rate": 2.5961111111111114e-06, "loss": 0.0005, "num_tokens": 1590696.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 5328 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 5.5, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 5.5, "completions/mean_terminated_length": 5.5, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 98.68518518518519, "frac_reward_zero_std": 1.0, "grad_norm": 0.03432449325919151, "kl": 0.0022105438183643855, "learning_rate": 2.5955555555555558e-06, "loss": 0.0001, "num_tokens": 1590918.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 5329 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 28.25, "completions/clipped_ratio": 0.0, "completions/max_length": 31.0, "completions/max_terminated_length": 31.0, "completions/mean_length": 28.25, "completions/mean_terminated_length": 28.25, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 98.70370370370371, "frac_reward_zero_std": 1.0, "grad_norm": 0.3652115762233734, "kl": 0.18640200048685074, "learning_rate": 2.595e-06, "loss": 0.0094, "num_tokens": 1591255.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5330 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 29.75, "completions/clipped_ratio": 0.0, "completions/max_length": 40.0, "completions/max_terminated_length": 40.0, "completions/mean_length": 29.75, "completions/mean_terminated_length": 29.75, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 98.72222222222223, "frac_reward_zero_std": 0.0, "grad_norm": 2.1973254680633545, "kl": 0.06251100450754166, "learning_rate": 2.5944444444444445e-06, "loss": -0.1536, "num_tokens": 1591626.0, "reward": 7.625, "reward_std": 0.25, "rewards/reward_combined/mean": 7.625, "rewards/reward_combined/std": 0.25, "step": 5331 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.5, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 3.5, "completions/mean_terminated_length": 3.5, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 98.74074074074075, "frac_reward_zero_std": 1.0, "grad_norm": 2.0190091133117676, "kl": 0.26640399545431137, "learning_rate": 2.5938888888888893e-06, "loss": 0.0198, "num_tokens": 1591836.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 5332 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 98.75925925925925, "frac_reward_zero_std": 1.0, "grad_norm": 0.037024956196546555, "kl": 0.004681289196014404, "learning_rate": 2.5933333333333336e-06, "loss": 0.0002, "num_tokens": 1592108.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5333 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 98.77777777777777, "frac_reward_zero_std": 0.0, "grad_norm": 5.33048152923584, "kl": 0.06139074917882681, "learning_rate": 2.592777777777778e-06, "loss": 0.0505, "num_tokens": 1592372.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 5334 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.75, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 15.75, "completions/mean_terminated_length": 15.75, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 98.79629629629629, "frac_reward_zero_std": 1.0, "grad_norm": 0.08792614936828613, "kl": 0.031225665472447872, "learning_rate": 2.5922222222222228e-06, "loss": 0.0015, "num_tokens": 1592651.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5335 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 29.25, "completions/clipped_ratio": 0.0, "completions/max_length": 30.0, "completions/max_terminated_length": 30.0, "completions/mean_length": 29.25, "completions/mean_terminated_length": 29.25, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 98.81481481481481, "frac_reward_zero_std": 1.0, "grad_norm": 0.04430417716503143, "kl": 0.12771649658679962, "learning_rate": 2.5916666666666667e-06, "loss": 0.0064, "num_tokens": 1592984.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5336 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 98.83333333333333, "frac_reward_zero_std": 1.0, "grad_norm": 0.05953742936253548, "kl": 0.015821157954633236, "learning_rate": 2.5911111111111115e-06, "loss": 0.0008, "num_tokens": 1593256.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5337 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.5, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 9.5, "completions/mean_terminated_length": 9.5, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 98.85185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.3734985291957855, "kl": 0.03603959083557129, "learning_rate": 2.5905555555555554e-06, "loss": 0.0023, "num_tokens": 1593506.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 5338 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.0, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 17.0, "completions/mean_terminated_length": 17.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 98.87037037037037, "frac_reward_zero_std": 0.0, "grad_norm": 3.390059232711792, "kl": 0.04225076828151941, "learning_rate": 2.59e-06, "loss": 0.0528, "num_tokens": 1593802.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 5339 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 98.88888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.016665315255522728, "kl": 0.011076232563937083, "learning_rate": 2.5894444444444445e-06, "loss": 0.0006, "num_tokens": 1594090.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5340 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 34.25, "completions/clipped_ratio": 0.0, "completions/max_length": 35.0, "completions/max_terminated_length": 35.0, "completions/mean_length": 34.25, "completions/mean_terminated_length": 34.25, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 98.9074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.02399701252579689, "kl": 0.07955548167228699, "learning_rate": 2.5888888888888893e-06, "loss": 0.004, "num_tokens": 1594455.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5341 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.5, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 16.5, "completions/mean_terminated_length": 16.5, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 98.92592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.3747067451477051, "kl": 0.057108381763100624, "learning_rate": 2.5883333333333337e-06, "loss": 0.003, "num_tokens": 1594769.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 5342 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 98.94444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.04014034569263458, "kl": 0.0036272567231208086, "learning_rate": 2.587777777777778e-06, "loss": 0.0002, "num_tokens": 1595055.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5343 }, { "clip_ratio/high_max": 0.011363636702299118, "clip_ratio/high_mean": 0.011363636702299118, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.011363636702299118, "completion_length": 18.25, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 18.25, "completions/mean_terminated_length": 18.25, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 98.96296296296296, "frac_reward_zero_std": 0.0, "grad_norm": 2.8824830055236816, "kl": 0.22206775844097137, "learning_rate": 2.587222222222223e-06, "loss": -0.1426, "num_tokens": 1595380.0, "reward": 6.75, "reward_std": 2.5, "rewards/reward_combined/mean": 6.75, "rewards/reward_combined/std": 2.5, "step": 5344 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 98.98148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.02610420063138008, "kl": 0.00771152158267796, "learning_rate": 2.5866666666666667e-06, "loss": 0.0004, "num_tokens": 1595652.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5345 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.0, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 22.0, "completions/mean_terminated_length": 22.0, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 99.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.03620046377182007, "kl": 0.0210751099511981, "learning_rate": 2.5861111111111115e-06, "loss": 0.0011, "num_tokens": 1595988.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5346 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 99.01851851851852, "frac_reward_zero_std": 0.0, "grad_norm": 5.047631740570068, "kl": 0.03569191321730614, "learning_rate": 2.5855555555555555e-06, "loss": 0.0005, "num_tokens": 1596260.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 5347 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.75, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 18.75, "completions/mean_terminated_length": 18.75, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 99.03703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.1156671866774559, "kl": 0.029874630272388458, "learning_rate": 2.5850000000000002e-06, "loss": 0.0015, "num_tokens": 1596555.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5348 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 27.0, "completions/clipped_ratio": 0.0, "completions/max_length": 31.0, "completions/max_terminated_length": 31.0, "completions/mean_length": 27.0, "completions/mean_terminated_length": 27.0, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 99.05555555555556, "frac_reward_zero_std": 0.0, "grad_norm": 1.5343068838119507, "kl": 0.16894712671637535, "learning_rate": 2.5844444444444446e-06, "loss": 0.1274, "num_tokens": 1596879.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 5349 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.0, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 22.0, "completions/mean_terminated_length": 22.0, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 99.07407407407408, "frac_reward_zero_std": 0.0, "grad_norm": 2.288985013961792, "kl": 0.022873520851135254, "learning_rate": 2.583888888888889e-06, "loss": 0.0253, "num_tokens": 1597211.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 5350 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 99.0925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.13338780403137207, "kl": 0.025384187698364258, "learning_rate": 2.5833333333333337e-06, "loss": 0.0013, "num_tokens": 1597487.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5351 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 99.11111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.012878862209618092, "kl": 0.23915059864521027, "learning_rate": 2.582777777777778e-06, "loss": 0.0119, "num_tokens": 1597787.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5352 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.5, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 20.5, "completions/mean_terminated_length": 20.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 99.12962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.10618660598993301, "kl": 0.030967731960117817, "learning_rate": 2.5822222222222224e-06, "loss": 0.0016, "num_tokens": 1598117.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5353 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 99.14814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.057999078184366226, "kl": 0.0061847122851759195, "learning_rate": 2.581666666666667e-06, "loss": 0.0003, "num_tokens": 1598393.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5354 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 99.16666666666667, "frac_reward_zero_std": 1.0, "grad_norm": 0.046694040298461914, "kl": 0.012586678843945265, "learning_rate": 2.5811111111111116e-06, "loss": 0.0007, "num_tokens": 1598663.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5355 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 27.5, "completions/clipped_ratio": 0.0, "completions/max_length": 37.0, "completions/max_terminated_length": 37.0, "completions/mean_length": 27.5, "completions/mean_terminated_length": 27.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 99.18518518518519, "frac_reward_zero_std": 1.0, "grad_norm": 0.03704658895730972, "kl": 0.03340607322752476, "learning_rate": 2.5805555555555555e-06, "loss": 0.002, "num_tokens": 1598993.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5356 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 99.20370370370371, "frac_reward_zero_std": 0.0, "grad_norm": 5.737944602966309, "kl": 0.05598395876586437, "learning_rate": 2.5800000000000003e-06, "loss": 0.0757, "num_tokens": 1599259.0, "reward": 6.125, "reward_std": 3.75, "rewards/reward_combined/mean": 6.125, "rewards/reward_combined/std": 3.75, "step": 5357 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 99.22222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.015402235090732574, "kl": 0.0009844303131103516, "learning_rate": 2.5794444444444442e-06, "loss": 0.0, "num_tokens": 1599471.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 5358 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 99.24074074074075, "frac_reward_zero_std": 1.0, "grad_norm": 0.022058824077248573, "kl": 0.002983078360557556, "learning_rate": 2.578888888888889e-06, "loss": 0.0002, "num_tokens": 1599679.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 5359 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 28.0, "completions/clipped_ratio": 0.0, "completions/max_length": 32.0, "completions/max_terminated_length": 32.0, "completions/mean_length": 28.0, "completions/mean_terminated_length": 28.0, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 99.25925925925925, "frac_reward_zero_std": 1.0, "grad_norm": 0.14974507689476013, "kl": 0.04662378504872322, "learning_rate": 2.5783333333333338e-06, "loss": 0.0024, "num_tokens": 1600039.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 5360 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 29.25, "completions/clipped_ratio": 0.0, "completions/max_length": 37.0, "completions/max_terminated_length": 37.0, "completions/mean_length": 29.25, "completions/mean_terminated_length": 29.25, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 99.27777777777777, "frac_reward_zero_std": 0.0, "grad_norm": 3.1688950061798096, "kl": 0.3420425280928612, "learning_rate": 2.577777777777778e-06, "loss": 0.1695, "num_tokens": 1600380.0, "reward": 5.0, "reward_std": 3.34165620803833, "rewards/reward_combined/mean": 5.0, "rewards/reward_combined/std": 3.34165620803833, "step": 5361 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 28.25, "completions/clipped_ratio": 0.0, "completions/max_length": 31.0, "completions/max_terminated_length": 31.0, "completions/mean_length": 28.25, "completions/mean_terminated_length": 28.25, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 99.29629629629629, "frac_reward_zero_std": 1.0, "grad_norm": 0.05445580184459686, "kl": 0.04184877499938011, "learning_rate": 2.5772222222222225e-06, "loss": 0.002, "num_tokens": 1600721.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5362 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.5, "completions/clipped_ratio": 0.0, "completions/max_length": 36.0, "completions/max_terminated_length": 36.0, "completions/mean_length": 24.5, "completions/mean_terminated_length": 24.5, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 99.31481481481481, "frac_reward_zero_std": 1.0, "grad_norm": 0.16508466005325317, "kl": 0.07202363386750221, "learning_rate": 2.576666666666667e-06, "loss": 0.0039, "num_tokens": 1601071.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5363 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.5, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 8.5, "completions/mean_terminated_length": 8.5, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 99.33333333333333, "frac_reward_zero_std": 1.0, "grad_norm": 0.03432736173272133, "kl": 0.0024441094137728214, "learning_rate": 2.5761111111111116e-06, "loss": 0.0001, "num_tokens": 1601325.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 5364 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.25, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 4.25, "completions/mean_terminated_length": 4.25, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 99.35185185185185, "frac_reward_zero_std": 0.0, "grad_norm": 6.199028491973877, "kl": 1.5797821953892708, "learning_rate": 2.5755555555555556e-06, "loss": 0.1258, "num_tokens": 1601542.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 5365 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.25, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 17.25, "completions/mean_terminated_length": 17.25, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 99.37037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.02478117123246193, "kl": 0.002216210006736219, "learning_rate": 2.5750000000000003e-06, "loss": 0.0001, "num_tokens": 1601851.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5366 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.75, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 17.75, "completions/mean_terminated_length": 17.75, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 99.38888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.2905186712741852, "kl": 0.06250121188350022, "learning_rate": 2.5744444444444443e-06, "loss": 0.0034, "num_tokens": 1602162.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5367 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.75, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 4.75, "completions/mean_terminated_length": 4.75, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 99.4074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.01654033362865448, "kl": 0.001658409892115742, "learning_rate": 2.573888888888889e-06, "loss": 0.0001, "num_tokens": 1602381.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 5368 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 99.42592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.0007856248994357884, "kl": 5.3510069847106934e-05, "learning_rate": 2.573333333333334e-06, "loss": 0.0, "num_tokens": 1602601.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 5369 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 99.44444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.007904558442533016, "kl": 0.005931364372372627, "learning_rate": 2.5727777777777778e-06, "loss": 0.0003, "num_tokens": 1602913.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 5370 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 99.46296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.05094729736447334, "kl": 0.05151311866939068, "learning_rate": 2.5722222222222225e-06, "loss": 0.0029, "num_tokens": 1603189.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5371 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.25, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 15.25, "completions/mean_terminated_length": 15.25, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 99.48148148148148, "frac_reward_zero_std": 0.0, "grad_norm": 3.0899059772491455, "kl": 0.04105105786584318, "learning_rate": 2.571666666666667e-06, "loss": -0.1259, "num_tokens": 1603470.0, "reward": 7.625, "reward_std": 0.25, "rewards/reward_combined/mean": 7.625, "rewards/reward_combined/std": 0.25, "step": 5372 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 99.5, "frac_reward_zero_std": 1.0, "grad_norm": 0.007731132209300995, "kl": 0.0011559235863387585, "learning_rate": 2.5711111111111112e-06, "loss": 0.0001, "num_tokens": 1603788.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5373 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 99.51851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.00911715067923069, "kl": 0.0009079724550247192, "learning_rate": 2.5705555555555556e-06, "loss": 0.0, "num_tokens": 1604032.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 5374 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.02631578966975212, "clip_ratio/low_min": 0.02631578966975212, "clip_ratio/region_mean": 0.02631578966975212, "completion_length": 8.25, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 8.25, "completions/mean_terminated_length": 8.25, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 99.53703703703704, "frac_reward_zero_std": 0.0, "grad_norm": 11.736588478088379, "kl": 0.015324518491979688, "learning_rate": 2.5700000000000004e-06, "loss": 0.1362, "num_tokens": 1604273.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 5375 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.5, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 24.5, "completions/mean_terminated_length": 24.5, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 99.55555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.044351570308208466, "kl": 0.08527051657438278, "learning_rate": 2.5694444444444443e-06, "loss": 0.0043, "num_tokens": 1604639.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 5376 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.25, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 17.25, "completions/mean_terminated_length": 17.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 99.57407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.11416739225387573, "kl": 0.15477371215820312, "learning_rate": 2.568888888888889e-06, "loss": 0.0077, "num_tokens": 1604948.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5377 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 99.5925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.09636739641427994, "kl": 0.01839794684201479, "learning_rate": 2.568333333333334e-06, "loss": 0.0009, "num_tokens": 1605254.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5378 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.25, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 19.25, "completions/mean_terminated_length": 19.25, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 99.61111111111111, "frac_reward_zero_std": 0.0, "grad_norm": 1.791133165359497, "kl": 0.031668925657868385, "learning_rate": 2.567777777777778e-06, "loss": -0.1085, "num_tokens": 1605583.0, "reward": 7.875, "reward_std": 0.25, "rewards/reward_combined/mean": 7.875, "rewards/reward_combined/std": 0.25, "step": 5379 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 33.25, "completions/clipped_ratio": 0.0, "completions/max_length": 36.0, "completions/max_terminated_length": 36.0, "completions/mean_length": 33.25, "completions/mean_terminated_length": 33.25, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 99.62962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.05350108817219734, "kl": 0.08827249333262444, "learning_rate": 2.5672222222222226e-06, "loss": 0.0044, "num_tokens": 1605944.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5380 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 99.64814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.007368301972746849, "kl": 0.009272322058677673, "learning_rate": 2.566666666666667e-06, "loss": 0.0005, "num_tokens": 1606180.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 5381 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 28.75, "completions/clipped_ratio": 0.0, "completions/max_length": 32.0, "completions/max_terminated_length": 32.0, "completions/mean_length": 28.75, "completions/mean_terminated_length": 28.75, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 99.66666666666667, "frac_reward_zero_std": 1.0, "grad_norm": 0.047177284955978394, "kl": 0.1016039103269577, "learning_rate": 2.5661111111111113e-06, "loss": 0.0051, "num_tokens": 1606531.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5382 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 99.68518518518519, "frac_reward_zero_std": 1.0, "grad_norm": 0.1689595878124237, "kl": 0.030433961655944586, "learning_rate": 2.5655555555555557e-06, "loss": 0.0016, "num_tokens": 1606823.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5383 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 99.70370370370371, "frac_reward_zero_std": 1.0, "grad_norm": 0.009895955212414265, "kl": 0.0009445150790270418, "learning_rate": 2.5650000000000004e-06, "loss": 0.0, "num_tokens": 1607095.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5384 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 99.72222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.02423064038157463, "kl": 0.011745330411940813, "learning_rate": 2.5644444444444444e-06, "loss": 0.0006, "num_tokens": 1607355.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5385 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.0, "completions/clipped_ratio": 0.0, "completions/max_length": 30.0, "completions/max_terminated_length": 30.0, "completions/mean_length": 21.0, "completions/mean_terminated_length": 21.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 99.74074074074075, "frac_reward_zero_std": 1.0, "grad_norm": 0.10493240505456924, "kl": 0.06897155568003654, "learning_rate": 2.563888888888889e-06, "loss": 0.0034, "num_tokens": 1607659.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 5386 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 99.75925925925925, "frac_reward_zero_std": 1.0, "grad_norm": 0.009736581705510616, "kl": 0.0026005889812950045, "learning_rate": 2.563333333333334e-06, "loss": 0.0001, "num_tokens": 1607947.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5387 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 33.25, "completions/clipped_ratio": 0.0, "completions/max_length": 38.0, "completions/max_terminated_length": 38.0, "completions/mean_length": 33.25, "completions/mean_terminated_length": 33.25, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 99.77777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.026023002341389656, "kl": 0.034491196274757385, "learning_rate": 2.562777777777778e-06, "loss": 0.0018, "num_tokens": 1608304.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5388 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.25, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.25, "completions/mean_terminated_length": 15.25, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 99.79629629629629, "frac_reward_zero_std": 0.0, "grad_norm": 2.092515468597412, "kl": 0.024425873532891273, "learning_rate": 2.5622222222222226e-06, "loss": -0.0169, "num_tokens": 1608597.0, "reward": 5.75, "reward_std": 4.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 4.5, "step": 5389 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 12.5, "completions/mean_terminated_length": 12.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 99.81481481481481, "frac_reward_zero_std": 1.0, "grad_norm": 0.09197484701871872, "kl": 0.0403946191072464, "learning_rate": 2.5616666666666666e-06, "loss": 0.0022, "num_tokens": 1608871.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5390 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.25, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 15.25, "completions/mean_terminated_length": 15.25, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 99.83333333333333, "frac_reward_zero_std": 1.0, "grad_norm": 0.05037963390350342, "kl": 0.023382452549412847, "learning_rate": 2.5611111111111113e-06, "loss": 0.0012, "num_tokens": 1609164.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5391 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.75, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 22.75, "completions/mean_terminated_length": 22.75, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 99.85185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.061836279928684235, "kl": 0.017133441753685474, "learning_rate": 2.5605555555555557e-06, "loss": 0.0008, "num_tokens": 1609483.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5392 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 99.87037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.008344466798007488, "kl": 0.00028264522552490234, "learning_rate": 2.56e-06, "loss": 0.0, "num_tokens": 1609695.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 5393 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 99.88888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.013015818782150745, "kl": 0.0004298865969758481, "learning_rate": 2.5594444444444444e-06, "loss": 0.0, "num_tokens": 1609951.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 5394 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 99.9074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.018257036805152893, "kl": 0.002271649893373251, "learning_rate": 2.558888888888889e-06, "loss": 0.0001, "num_tokens": 1610235.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5395 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.25, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 14.25, "completions/mean_terminated_length": 14.25, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 99.92592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.06050648167729378, "kl": 0.02644370961934328, "learning_rate": 2.558333333333334e-06, "loss": 0.0014, "num_tokens": 1610512.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5396 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 99.94444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.0009657081100158393, "kl": 0.0016840824391692877, "learning_rate": 2.557777777777778e-06, "loss": 0.0001, "num_tokens": 1610792.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5397 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 99.96296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.01226333063095808, "kl": 0.0032669343054294586, "learning_rate": 2.5572222222222227e-06, "loss": 0.0002, "num_tokens": 1611052.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 5398 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 41.5, "completions/clipped_ratio": 0.0, "completions/max_length": 45.0, "completions/max_terminated_length": 45.0, "completions/mean_length": 41.5, "completions/mean_terminated_length": 41.5, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 99.98148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.03040095418691635, "kl": 0.043944044038653374, "learning_rate": 2.5566666666666666e-06, "loss": 0.0022, "num_tokens": 1611498.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 5399 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 10.5, "completions/mean_terminated_length": 10.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 100.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.038719743490219116, "kl": 0.0048247158993035555, "learning_rate": 2.5561111111111114e-06, "loss": 0.0003, "num_tokens": 1611760.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5400 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.75, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 17.75, "completions/mean_terminated_length": 17.75, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 100.01851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.056214895099401474, "kl": 0.14149929583072662, "learning_rate": 2.5555555555555557e-06, "loss": 0.0071, "num_tokens": 1612071.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5401 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.0, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 24.0, "completions/mean_terminated_length": 24.0, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 100.03703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.005974870175123215, "kl": 0.08921778574585915, "learning_rate": 2.555e-06, "loss": 0.0045, "num_tokens": 1612435.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 5402 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.25, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 19.25, "completions/mean_terminated_length": 19.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 100.05555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.10339339077472687, "kl": 0.22123929858207703, "learning_rate": 2.5544444444444445e-06, "loss": 0.0103, "num_tokens": 1612760.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5403 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.75, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 18.75, "completions/mean_terminated_length": 18.75, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 100.07407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.09965769946575165, "kl": 0.0327876191586256, "learning_rate": 2.5538888888888892e-06, "loss": 0.0016, "num_tokens": 1613055.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5404 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 100.0925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.013377469033002853, "kl": 0.00048419239465147257, "learning_rate": 2.5533333333333336e-06, "loss": 0.0, "num_tokens": 1613311.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 5405 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.25, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 20.25, "completions/mean_terminated_length": 20.25, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 100.11111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.05556702986359596, "kl": 0.013316734693944454, "learning_rate": 2.552777777777778e-06, "loss": 0.0007, "num_tokens": 1613640.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 5406 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 51.75, "completions/clipped_ratio": 0.0, "completions/max_length": 123.0, "completions/max_terminated_length": 123.0, "completions/mean_length": 51.75, "completions/mean_terminated_length": 51.75, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 100.12962962962963, "frac_reward_zero_std": 0.0, "grad_norm": 2.923677444458008, "kl": 0.14806007593870163, "learning_rate": 2.5522222222222227e-06, "loss": 0.3348, "num_tokens": 1614071.0, "reward": 5.375, "reward_std": 5.25, "rewards/reward_combined/mean": 5.375, "rewards/reward_combined/std": 5.25, "step": 5407 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 100.14814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.10073071718215942, "kl": 0.013579241465777159, "learning_rate": 2.5516666666666667e-06, "loss": 0.0007, "num_tokens": 1614361.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5408 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 5.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 5.0, "completions/mean_terminated_length": 5.0, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 100.16666666666667, "frac_reward_zero_std": 1.0, "grad_norm": 0.002971409820020199, "kl": 0.00030336977215483785, "learning_rate": 2.5511111111111114e-06, "loss": 0.0, "num_tokens": 1614581.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 5409 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.0, "completions/clipped_ratio": 0.0, "completions/max_length": 32.0, "completions/max_terminated_length": 32.0, "completions/mean_length": 24.0, "completions/mean_terminated_length": 24.0, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 100.18518518518519, "frac_reward_zero_std": 1.0, "grad_norm": 0.0886543020606041, "kl": 0.017868841998279095, "learning_rate": 2.5505555555555554e-06, "loss": 0.0009, "num_tokens": 1614921.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5410 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 34.75, "completions/clipped_ratio": 0.0, "completions/max_length": 36.0, "completions/max_terminated_length": 36.0, "completions/mean_length": 34.75, "completions/mean_terminated_length": 34.75, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 100.20370370370371, "frac_reward_zero_std": 0.0, "grad_norm": 2.0332579612731934, "kl": 0.14925231784582138, "learning_rate": 2.55e-06, "loss": 0.0109, "num_tokens": 1615288.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 5411 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 100.22222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.0008832706953398883, "kl": 0.0016796651761978865, "learning_rate": 2.5494444444444445e-06, "loss": 0.0001, "num_tokens": 1615568.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5412 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 100.24074074074075, "frac_reward_zero_std": 1.0, "grad_norm": 0.04426419362425804, "kl": 0.010309961158782244, "learning_rate": 2.5488888888888893e-06, "loss": 0.0005, "num_tokens": 1615866.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5413 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 12.5, "completions/mean_terminated_length": 12.5, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 100.25925925925925, "frac_reward_zero_std": 1.0, "grad_norm": 0.032333724200725555, "kl": 0.0013268657494336367, "learning_rate": 2.5483333333333336e-06, "loss": 0.0001, "num_tokens": 1616128.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5414 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 100.27777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.029041897505521774, "kl": 0.013758502434939146, "learning_rate": 2.547777777777778e-06, "loss": 0.0007, "num_tokens": 1616396.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5415 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 100.29629629629629, "frac_reward_zero_std": 1.0, "grad_norm": 0.030256763100624084, "kl": 0.0026768818497657776, "learning_rate": 2.5472222222222228e-06, "loss": 0.0001, "num_tokens": 1616604.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 5416 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 6.0, "completions/mean_terminated_length": 6.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 100.31481481481481, "frac_reward_zero_std": 1.0, "grad_norm": 0.8436766266822815, "kl": 0.2055099457502365, "learning_rate": 2.5466666666666667e-06, "loss": 0.0103, "num_tokens": 1616828.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 5417 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.5, "completions/clipped_ratio": 0.0, "completions/max_length": 37.0, "completions/max_terminated_length": 37.0, "completions/mean_length": 24.5, "completions/mean_terminated_length": 24.5, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 100.33333333333333, "frac_reward_zero_std": 0.0, "grad_norm": 2.7729382514953613, "kl": 0.1909218281507492, "learning_rate": 2.5461111111111115e-06, "loss": 0.155, "num_tokens": 1617146.0, "reward": 6.125, "reward_std": 3.4247870445251465, "rewards/reward_combined/mean": 6.125, "rewards/reward_combined/std": 3.4247870445251465, "step": 5418 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.5, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 100.35185185185185, "frac_reward_zero_std": 0.0, "grad_norm": 2.3739750385284424, "kl": 0.31608064472675323, "learning_rate": 2.5455555555555554e-06, "loss": 0.0477, "num_tokens": 1617448.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 5419 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.25, "completions/clipped_ratio": 0.0, "completions/max_length": 46.0, "completions/max_terminated_length": 46.0, "completions/mean_length": 23.25, "completions/mean_terminated_length": 23.25, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 100.37037037037037, "frac_reward_zero_std": 0.0, "grad_norm": 3.6765353679656982, "kl": 0.022623569355346262, "learning_rate": 2.545e-06, "loss": 0.2471, "num_tokens": 1617801.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 5420 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 100.38888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.011379244737327099, "kl": 0.0017284911591559649, "learning_rate": 2.5444444444444446e-06, "loss": 0.0001, "num_tokens": 1618085.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5421 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 100.4074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.012851860374212265, "kl": 0.0008100643754005432, "learning_rate": 2.543888888888889e-06, "loss": 0.0, "num_tokens": 1618297.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 5422 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.75, "completions/clipped_ratio": 0.0, "completions/max_length": 25.0, "completions/max_terminated_length": 25.0, "completions/mean_length": 21.75, "completions/mean_terminated_length": 21.75, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 100.42592592592592, "frac_reward_zero_std": 0.0, "grad_norm": 2.199079751968384, "kl": 0.019227453507483006, "learning_rate": 2.5433333333333337e-06, "loss": 0.057, "num_tokens": 1618636.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 5423 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.25, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 19.25, "completions/mean_terminated_length": 19.25, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 100.44444444444444, "frac_reward_zero_std": 0.0, "grad_norm": 2.577407121658325, "kl": 0.0223993300460279, "learning_rate": 2.542777777777778e-06, "loss": 0.1089, "num_tokens": 1618941.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 5424 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 100.46296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.02783801779150963, "kl": 0.0030634899740107358, "learning_rate": 2.5422222222222224e-06, "loss": 0.0002, "num_tokens": 1619213.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5425 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.5, "completions/clipped_ratio": 0.0, "completions/max_length": 40.0, "completions/max_terminated_length": 40.0, "completions/mean_length": 23.5, "completions/mean_terminated_length": 23.5, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 100.48148148148148, "frac_reward_zero_std": 0.0, "grad_norm": 3.5949885845184326, "kl": 0.038429130567237735, "learning_rate": 2.5416666666666668e-06, "loss": 0.1914, "num_tokens": 1619547.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 5426 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 39.5, "completions/clipped_ratio": 0.0, "completions/max_length": 65.0, "completions/max_terminated_length": 65.0, "completions/mean_length": 39.5, "completions/mean_terminated_length": 39.5, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 100.5, "frac_reward_zero_std": 0.0, "grad_norm": 2.338369369506836, "kl": 0.03545006737112999, "learning_rate": 2.5411111111111115e-06, "loss": 0.182, "num_tokens": 1619929.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 5427 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 100.51851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.052452344447374344, "kl": 0.016229163389652967, "learning_rate": 2.5405555555555555e-06, "loss": 0.0008, "num_tokens": 1620217.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5428 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.25, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 16.25, "completions/mean_terminated_length": 16.25, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 100.53703703703704, "frac_reward_zero_std": 0.0, "grad_norm": 3.2920658588409424, "kl": 0.012143343687057495, "learning_rate": 2.5400000000000002e-06, "loss": 0.0148, "num_tokens": 1620530.0, "reward": 2.75, "reward_std": 1.5, "rewards/reward_combined/mean": 2.75, "rewards/reward_combined/std": 1.5, "step": 5429 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 10.5, "completions/mean_terminated_length": 10.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 100.55555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.031286004930734634, "kl": 0.003409292781725526, "learning_rate": 2.539444444444444e-06, "loss": 0.0002, "num_tokens": 1620792.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5430 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.75, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.75, "completions/mean_terminated_length": 14.75, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 100.57407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.17051242291927338, "kl": 0.04419940384104848, "learning_rate": 2.538888888888889e-06, "loss": 0.0022, "num_tokens": 1621083.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5431 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.75, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 18.75, "completions/mean_terminated_length": 18.75, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 100.5925925925926, "frac_reward_zero_std": 0.0, "grad_norm": 1.7341890335083008, "kl": 0.0438017500564456, "learning_rate": 2.5383333333333337e-06, "loss": -0.0019, "num_tokens": 1621382.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 5432 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 31.75, "completions/clipped_ratio": 0.0, "completions/max_length": 48.0, "completions/max_terminated_length": 48.0, "completions/mean_length": 31.75, "completions/mean_terminated_length": 31.75, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 100.61111111111111, "frac_reward_zero_std": 0.0, "grad_norm": 2.3417859077453613, "kl": 0.05940388049930334, "learning_rate": 2.537777777777778e-06, "loss": -0.1245, "num_tokens": 1621729.0, "reward": 5.25, "reward_std": 4.5, "rewards/reward_combined/mean": 5.25, "rewards/reward_combined/std": 4.5, "step": 5433 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 100.62962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.010019652545452118, "kl": 0.001199198653921485, "learning_rate": 2.5372222222222224e-06, "loss": 0.0001, "num_tokens": 1622009.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5434 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.25, "completions/clipped_ratio": 0.0, "completions/max_length": 27.0, "completions/max_terminated_length": 27.0, "completions/mean_length": 22.25, "completions/mean_terminated_length": 22.25, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 100.64814814814815, "frac_reward_zero_std": 0.0, "grad_norm": 3.4564242362976074, "kl": 0.15757225453853607, "learning_rate": 2.536666666666667e-06, "loss": 0.0153, "num_tokens": 1622318.0, "reward": 2.5, "reward_std": 1.7320507764816284, "rewards/reward_combined/mean": 2.5, "rewards/reward_combined/std": 1.7320507764816284, "step": 5435 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.5, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.5, "completions/mean_terminated_length": 6.5, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 100.66666666666667, "frac_reward_zero_std": 1.0, "grad_norm": 0.005262145306915045, "kl": 0.0004959943616995588, "learning_rate": 2.5361111111111116e-06, "loss": 0.0, "num_tokens": 1622552.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 5436 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 36.5, "completions/clipped_ratio": 0.0, "completions/max_length": 44.0, "completions/max_terminated_length": 44.0, "completions/mean_length": 36.5, "completions/mean_terminated_length": 36.5, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 100.68518518518519, "frac_reward_zero_std": 0.0, "grad_norm": 3.173781156539917, "kl": 0.06994973868131638, "learning_rate": 2.5355555555555555e-06, "loss": 0.0387, "num_tokens": 1622950.0, "reward": 6.75, "reward_std": 2.1794495582580566, "rewards/reward_combined/mean": 6.75, "rewards/reward_combined/std": 2.1794495582580566, "step": 5437 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 10.5, "completions/mean_terminated_length": 10.5, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 100.70370370370371, "frac_reward_zero_std": 1.0, "grad_norm": 0.02372923493385315, "kl": 0.005331070693500806, "learning_rate": 2.5350000000000003e-06, "loss": 0.0003, "num_tokens": 1623216.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5438 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 100.72222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.12291641533374786, "kl": 0.020807057269848883, "learning_rate": 2.534444444444445e-06, "loss": 0.0014, "num_tokens": 1623488.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 5439 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.25, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.25, "completions/mean_terminated_length": 11.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 100.74074074074075, "frac_reward_zero_std": 1.0, "grad_norm": 0.052744925022125244, "kl": 0.013409722596406937, "learning_rate": 2.533888888888889e-06, "loss": 0.0007, "num_tokens": 1623761.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5440 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 100.75925925925925, "frac_reward_zero_std": 1.0, "grad_norm": 0.009191208519041538, "kl": 0.0009931214153766632, "learning_rate": 2.5333333333333338e-06, "loss": 0.0, "num_tokens": 1624005.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 5441 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.5, "completions/clipped_ratio": 0.0, "completions/max_length": 31.0, "completions/max_terminated_length": 31.0, "completions/mean_length": 20.5, "completions/mean_terminated_length": 20.5, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 100.77777777777777, "frac_reward_zero_std": 0.0, "grad_norm": 4.769653797149658, "kl": 0.08456568047404289, "learning_rate": 2.5327777777777777e-06, "loss": 0.0078, "num_tokens": 1624315.0, "reward": 5.75, "reward_std": 2.598076105117798, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 2.598076105117798, "step": 5442 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 100.79629629629629, "frac_reward_zero_std": 1.0, "grad_norm": 0.007472280878573656, "kl": 0.009235680103302002, "learning_rate": 2.5322222222222225e-06, "loss": 0.0005, "num_tokens": 1624551.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 5443 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 42.5, "completions/clipped_ratio": 0.0, "completions/max_length": 53.0, "completions/max_terminated_length": 53.0, "completions/mean_length": 42.5, "completions/mean_terminated_length": 42.5, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 100.81481481481481, "frac_reward_zero_std": 1.0, "grad_norm": 0.1336943656206131, "kl": 0.07702604681253433, "learning_rate": 2.531666666666667e-06, "loss": 0.0037, "num_tokens": 1625001.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 5444 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 100.83333333333333, "frac_reward_zero_std": 1.0, "grad_norm": 0.008520612493157387, "kl": 0.00029546022415161133, "learning_rate": 2.531111111111111e-06, "loss": 0.0, "num_tokens": 1625213.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 5445 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 100.85185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.0014496175572276115, "kl": 0.00011897832155227661, "learning_rate": 2.5305555555555556e-06, "loss": 0.0, "num_tokens": 1625433.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 5446 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 100.87037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.0201893113553524, "kl": 0.01283263461664319, "learning_rate": 2.5300000000000003e-06, "loss": 0.0006, "num_tokens": 1625693.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5447 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 100.88888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.005522768013179302, "kl": 0.003309585154056549, "learning_rate": 2.529444444444445e-06, "loss": 0.0002, "num_tokens": 1625953.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 5448 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.25, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 22.25, "completions/mean_terminated_length": 22.25, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 100.9074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.08543960750102997, "kl": 0.023385632317513227, "learning_rate": 2.528888888888889e-06, "loss": 0.0012, "num_tokens": 1626270.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5449 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 100.92592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.07655711472034454, "kl": 0.023105978965759277, "learning_rate": 2.528333333333334e-06, "loss": 0.0012, "num_tokens": 1626570.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5450 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.25, "completions/clipped_ratio": 0.0, "completions/max_length": 54.0, "completions/max_terminated_length": 54.0, "completions/mean_length": 32.25, "completions/mean_terminated_length": 32.25, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 100.94444444444444, "frac_reward_zero_std": 0.0, "grad_norm": 2.1689209938049316, "kl": 0.1361178159713745, "learning_rate": 2.5277777777777778e-06, "loss": 0.1804, "num_tokens": 1626935.0, "reward": 4.25, "reward_std": 4.092676162719727, "rewards/reward_combined/mean": 4.25, "rewards/reward_combined/std": 4.092676162719727, "step": 5451 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 34.25, "completions/clipped_ratio": 0.0, "completions/max_length": 42.0, "completions/max_terminated_length": 42.0, "completions/mean_length": 34.25, "completions/mean_terminated_length": 34.25, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 100.96296296296296, "frac_reward_zero_std": 0.0, "grad_norm": 2.199589252471924, "kl": 0.2203235775232315, "learning_rate": 2.5272222222222225e-06, "loss": 0.1536, "num_tokens": 1627288.0, "reward": 5.75, "reward_std": 2.598076105117798, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 2.598076105117798, "step": 5452 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.5, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 100.98148148148148, "frac_reward_zero_std": 0.0, "grad_norm": 2.571195602416992, "kl": 0.09436308592557907, "learning_rate": 2.526666666666667e-06, "loss": -0.1128, "num_tokens": 1627566.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 5453 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.75, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 11.75, "completions/mean_terminated_length": 11.75, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 101.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.06035804748535156, "kl": 0.02078956738114357, "learning_rate": 2.5261111111111113e-06, "loss": 0.001, "num_tokens": 1627833.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5454 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.25, "completions/clipped_ratio": 0.0, "completions/max_length": 33.0, "completions/max_terminated_length": 33.0, "completions/mean_length": 24.25, "completions/mean_terminated_length": 24.25, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 101.01851851851852, "frac_reward_zero_std": 0.0, "grad_norm": 4.750530242919922, "kl": 0.03117153514176607, "learning_rate": 2.5255555555555556e-06, "loss": 0.071, "num_tokens": 1628178.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 5455 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 101.03703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.060231804847717285, "kl": 0.008325956761837006, "learning_rate": 2.5250000000000004e-06, "loss": 0.0004, "num_tokens": 1628444.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5456 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 39.0, "completions/clipped_ratio": 0.0, "completions/max_length": 71.0, "completions/max_terminated_length": 71.0, "completions/mean_length": 39.0, "completions/mean_terminated_length": 39.0, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 101.05555555555556, "frac_reward_zero_std": 0.0, "grad_norm": 2.7280755043029785, "kl": 0.11941811442375183, "learning_rate": 2.5244444444444447e-06, "loss": 0.2696, "num_tokens": 1628836.0, "reward": 4.875, "reward_std": 3.1983067989349365, "rewards/reward_combined/mean": 4.875, "rewards/reward_combined/std": 3.1983067989349365, "step": 5457 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.0, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 20.0, "completions/mean_terminated_length": 20.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 101.07407407407408, "frac_reward_zero_std": 0.0, "grad_norm": 5.466806888580322, "kl": 0.5005347430706024, "learning_rate": 2.523888888888889e-06, "loss": -0.1007, "num_tokens": 1629136.0, "reward": 7.875, "reward_std": 0.25, "rewards/reward_combined/mean": 7.875, "rewards/reward_combined/std": 0.25, "step": 5458 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 26.5, "completions/clipped_ratio": 0.0, "completions/max_length": 46.0, "completions/max_terminated_length": 46.0, "completions/mean_length": 26.5, "completions/mean_terminated_length": 26.5, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 101.0925925925926, "frac_reward_zero_std": 0.0, "grad_norm": 1.9583325386047363, "kl": 0.04262696020305157, "learning_rate": 2.523333333333334e-06, "loss": 0.1971, "num_tokens": 1629462.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 5459 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 101.11111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.034670230001211166, "kl": 0.005280144279822707, "learning_rate": 2.522777777777778e-06, "loss": 0.0002, "num_tokens": 1629760.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5460 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 101.12962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.005305152852088213, "kl": 0.0001323223114013672, "learning_rate": 2.5222222222222226e-06, "loss": 0.0, "num_tokens": 1629972.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 5461 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.25, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 4.25, "completions/mean_terminated_length": 4.25, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 101.14814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.08880160748958588, "kl": 0.009628163650631905, "learning_rate": 2.5216666666666665e-06, "loss": 0.0005, "num_tokens": 1630185.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 5462 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 101.16666666666667, "frac_reward_zero_std": 1.0, "grad_norm": 0.0634235069155693, "kl": 0.02626852784305811, "learning_rate": 2.5211111111111113e-06, "loss": 0.0014, "num_tokens": 1630477.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5463 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 38.75, "completions/clipped_ratio": 0.0, "completions/max_length": 55.0, "completions/max_terminated_length": 55.0, "completions/mean_length": 38.75, "completions/mean_terminated_length": 38.75, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 101.18518518518519, "frac_reward_zero_std": 0.0, "grad_norm": 1.9029589891433716, "kl": 0.1794801875948906, "learning_rate": 2.5205555555555557e-06, "loss": -0.043, "num_tokens": 1630856.0, "reward": 4.625, "reward_std": 3.4731109142303467, "rewards/reward_combined/mean": 4.625, "rewards/reward_combined/std": 3.4731109142303467, "step": 5464 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 12.5, "completions/mean_terminated_length": 12.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 101.20370370370371, "frac_reward_zero_std": 1.0, "grad_norm": 0.06879772990942001, "kl": 0.05017814412713051, "learning_rate": 2.52e-06, "loss": 0.0028, "num_tokens": 1631130.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5465 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 18.0, "completions/mean_terminated_length": 18.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 101.22222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.007824460975825787, "kl": 0.002103174803778529, "learning_rate": 2.519444444444445e-06, "loss": 0.0001, "num_tokens": 1631442.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5466 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.25, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 22.25, "completions/mean_terminated_length": 22.25, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 101.24074074074075, "frac_reward_zero_std": 1.0, "grad_norm": 0.036902882158756256, "kl": 0.011481436900794506, "learning_rate": 2.518888888888889e-06, "loss": 0.0006, "num_tokens": 1631775.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5467 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.75, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 12.75, "completions/mean_terminated_length": 12.75, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 101.25925925925925, "frac_reward_zero_std": 0.0, "grad_norm": 3.2547268867492676, "kl": 0.010134945099707693, "learning_rate": 2.518333333333334e-06, "loss": 0.1656, "num_tokens": 1632050.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 5468 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.75, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.75, "completions/mean_terminated_length": 6.75, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 101.27777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.009275670163333416, "kl": 0.0007830964750610292, "learning_rate": 2.517777777777778e-06, "loss": 0.0, "num_tokens": 1632285.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 5469 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 26.75, "completions/clipped_ratio": 0.0, "completions/max_length": 41.0, "completions/max_terminated_length": 41.0, "completions/mean_length": 26.75, "completions/mean_terminated_length": 26.75, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 101.29629629629629, "frac_reward_zero_std": 0.0, "grad_norm": 2.3322365283966064, "kl": 0.03133982606232166, "learning_rate": 2.5172222222222226e-06, "loss": -0.1838, "num_tokens": 1632644.0, "reward": 7.625, "reward_std": 0.25, "rewards/reward_combined/mean": 7.625, "rewards/reward_combined/std": 0.25, "step": 5470 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 45.0, "completions/clipped_ratio": 0.0, "completions/max_length": 49.0, "completions/max_terminated_length": 49.0, "completions/mean_length": 45.0, "completions/mean_terminated_length": 45.0, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 101.31481481481481, "frac_reward_zero_std": 1.0, "grad_norm": 0.03074675239622593, "kl": 0.05015807785093784, "learning_rate": 2.5166666666666666e-06, "loss": 0.0025, "num_tokens": 1633104.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 5471 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.0, "completions/clipped_ratio": 0.0, "completions/max_length": 34.0, "completions/max_terminated_length": 34.0, "completions/mean_length": 25.0, "completions/mean_terminated_length": 25.0, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 101.33333333333333, "frac_reward_zero_std": 0.0, "grad_norm": 2.8449034690856934, "kl": 0.028042261488735676, "learning_rate": 2.5161111111111113e-06, "loss": 0.1107, "num_tokens": 1633432.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 5472 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 101.35185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.001317931804805994, "kl": 0.00010580569505691528, "learning_rate": 2.5155555555555557e-06, "loss": 0.0, "num_tokens": 1633652.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 5473 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.5, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 2.5, "completions/mean_terminated_length": 2.5, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 101.37037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.02028479054570198, "kl": 0.001700088381767273, "learning_rate": 2.515e-06, "loss": 0.0001, "num_tokens": 1633858.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 5474 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 101.38888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.06535637378692627, "kl": 0.013758102431893349, "learning_rate": 2.514444444444445e-06, "loss": 0.0007, "num_tokens": 1634134.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5475 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 12.5, "completions/mean_terminated_length": 12.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 101.4074074074074, "frac_reward_zero_std": 0.0, "grad_norm": 5.143098831176758, "kl": 0.07049257913604379, "learning_rate": 2.513888888888889e-06, "loss": 0.1373, "num_tokens": 1634404.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 5476 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.0, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 24.0, "completions/mean_terminated_length": 24.0, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 101.42592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.005218098405748606, "kl": 0.08949637413024902, "learning_rate": 2.5133333333333336e-06, "loss": 0.0045, "num_tokens": 1634768.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 5477 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 101.44444444444444, "frac_reward_zero_std": 0.0, "grad_norm": 3.014540195465088, "kl": 0.005055401416029781, "learning_rate": 2.512777777777778e-06, "loss": -0.0384, "num_tokens": 1635050.0, "reward": 7.625, "reward_std": 0.25, "rewards/reward_combined/mean": 7.625, "rewards/reward_combined/std": 0.25, "step": 5478 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.25, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 19.25, "completions/mean_terminated_length": 19.25, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 101.46296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.16442911326885223, "kl": 0.02644335851073265, "learning_rate": 2.5122222222222227e-06, "loss": 0.0013, "num_tokens": 1635347.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5479 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.006849315017461777, "clip_ratio/low_min": 0.006849315017461777, "clip_ratio/region_mean": 0.006849315017461777, "completion_length": 32.75, "completions/clipped_ratio": 0.0, "completions/max_length": 46.0, "completions/max_terminated_length": 46.0, "completions/mean_length": 32.75, "completions/mean_terminated_length": 32.75, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 101.48148148148148, "frac_reward_zero_std": 0.0, "grad_norm": 3.8768398761749268, "kl": 0.19516758620738983, "learning_rate": 2.5116666666666666e-06, "loss": 0.1356, "num_tokens": 1635702.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 5480 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 101.5, "frac_reward_zero_std": 1.0, "grad_norm": 0.02544652484357357, "kl": 0.0005195349513087422, "learning_rate": 2.5111111111111114e-06, "loss": 0.0, "num_tokens": 1635958.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 5481 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 101.51851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.043915845453739166, "kl": 0.00688141216232907, "learning_rate": 2.5105555555555553e-06, "loss": 0.0004, "num_tokens": 1636256.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5482 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.75, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 6.75, "completions/mean_terminated_length": 6.75, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 101.53703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.3879215717315674, "kl": 0.05715668946504593, "learning_rate": 2.51e-06, "loss": 0.0036, "num_tokens": 1636483.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 5483 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.75, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 17.75, "completions/mean_terminated_length": 17.75, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 101.55555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.06345385313034058, "kl": 0.1716216653585434, "learning_rate": 2.509444444444445e-06, "loss": 0.0086, "num_tokens": 1636794.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5484 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 101.57407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.01337228063493967, "kl": 0.006043243454769254, "learning_rate": 2.5088888888888892e-06, "loss": 0.0003, "num_tokens": 1637068.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5485 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 101.5925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.007353183813393116, "kl": 0.00934215635061264, "learning_rate": 2.5083333333333336e-06, "loss": 0.0005, "num_tokens": 1637304.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 5486 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 101.61111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.013110511004924774, "kl": 0.01483762264251709, "learning_rate": 2.507777777777778e-06, "loss": 0.0007, "num_tokens": 1637564.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5487 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.25, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.25, "completions/mean_terminated_length": 13.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 101.62962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.0004285744798835367, "kl": 0.0017739430186338723, "learning_rate": 2.5072222222222227e-06, "loss": 0.0001, "num_tokens": 1637841.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5488 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.009999999776482582, "clip_ratio/low_min": 0.009999999776482582, "clip_ratio/region_mean": 0.009999999776482582, "completion_length": 24.75, "completions/clipped_ratio": 0.0, "completions/max_length": 29.0, "completions/max_terminated_length": 29.0, "completions/mean_length": 24.75, "completions/mean_terminated_length": 24.75, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 101.64814814814815, "frac_reward_zero_std": 0.0, "grad_norm": 4.209275245666504, "kl": 0.2331688404083252, "learning_rate": 2.5066666666666667e-06, "loss": -0.0694, "num_tokens": 1638156.0, "reward": 5.625, "reward_std": 4.75, "rewards/reward_combined/mean": 5.625, "rewards/reward_combined/std": 4.75, "step": 5489 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 101.66666666666667, "frac_reward_zero_std": 1.0, "grad_norm": 0.1121700331568718, "kl": 0.03079107915982604, "learning_rate": 2.5061111111111114e-06, "loss": 0.0015, "num_tokens": 1638448.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5490 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.25, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 14.25, "completions/mean_terminated_length": 14.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 101.68518518518519, "frac_reward_zero_std": 1.0, "grad_norm": 0.05376268923282623, "kl": 0.005227272631600499, "learning_rate": 2.5055555555555554e-06, "loss": 0.0003, "num_tokens": 1638717.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5491 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.25, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 18.25, "completions/mean_terminated_length": 18.25, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 101.70370370370371, "frac_reward_zero_std": 0.0, "grad_norm": 8.100409507751465, "kl": 1.433023601770401, "learning_rate": 2.505e-06, "loss": -0.0689, "num_tokens": 1639018.0, "reward": 6.625, "reward_std": 2.428133726119995, "rewards/reward_combined/mean": 6.625, "rewards/reward_combined/std": 2.428133726119995, "step": 5492 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 101.72222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.00842171162366867, "kl": 0.0033253468573093414, "learning_rate": 2.504444444444445e-06, "loss": 0.0002, "num_tokens": 1639278.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 5493 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 101.74074074074075, "frac_reward_zero_std": 1.0, "grad_norm": 0.03290287032723427, "kl": 0.25652939826250076, "learning_rate": 2.503888888888889e-06, "loss": 0.0128, "num_tokens": 1639576.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5494 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.0, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 17.0, "completions/mean_terminated_length": 17.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 101.75925925925925, "frac_reward_zero_std": 1.0, "grad_norm": 0.025511542335152626, "kl": 0.006476711947470903, "learning_rate": 2.5033333333333336e-06, "loss": 0.0003, "num_tokens": 1639872.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5495 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 101.77777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.011643318459391594, "kl": 0.0009425669923075475, "learning_rate": 2.502777777777778e-06, "loss": 0.0, "num_tokens": 1640132.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 5496 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 101.79629629629629, "frac_reward_zero_std": 1.0, "grad_norm": 0.017525918781757355, "kl": 0.0031189483124762774, "learning_rate": 2.5022222222222224e-06, "loss": 0.0002, "num_tokens": 1640416.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5497 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 101.81481481481481, "frac_reward_zero_std": 1.0, "grad_norm": 0.12592151761054993, "kl": 0.027514537796378136, "learning_rate": 2.5016666666666667e-06, "loss": 0.0014, "num_tokens": 1640686.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5498 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.25, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 20.25, "completions/mean_terminated_length": 20.25, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 101.83333333333333, "frac_reward_zero_std": 1.0, "grad_norm": 0.11585982888936996, "kl": 0.02882299106568098, "learning_rate": 2.5011111111111115e-06, "loss": 0.0014, "num_tokens": 1641019.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5499 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 101.85185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.006333121098577976, "kl": 0.0007497221231460571, "learning_rate": 2.5005555555555554e-06, "loss": 0.0, "num_tokens": 1641263.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 5500 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 101.87037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.015307389199733734, "kl": 0.0035783765197265893, "learning_rate": 2.5e-06, "loss": 0.0002, "num_tokens": 1641553.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5501 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 101.88888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.015352359972894192, "kl": 0.009041615761816502, "learning_rate": 2.4994444444444446e-06, "loss": 0.0005, "num_tokens": 1641865.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 5502 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.25, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 16.25, "completions/mean_terminated_length": 16.25, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 101.9074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.011295316740870476, "kl": 0.001469508744776249, "learning_rate": 2.498888888888889e-06, "loss": 0.0001, "num_tokens": 1642190.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5503 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 34.25, "completions/clipped_ratio": 0.0, "completions/max_length": 36.0, "completions/max_terminated_length": 36.0, "completions/mean_length": 34.25, "completions/mean_terminated_length": 34.25, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 101.92592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.02664388343691826, "kl": 0.07650396600365639, "learning_rate": 2.4983333333333333e-06, "loss": 0.0038, "num_tokens": 1642555.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5504 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.0, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 20.0, "completions/mean_terminated_length": 20.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 101.94444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.03405565768480301, "kl": 0.025526640005409718, "learning_rate": 2.497777777777778e-06, "loss": 0.0013, "num_tokens": 1642883.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5505 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.75, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 4.75, "completions/mean_terminated_length": 4.75, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 101.96296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.061645783483982086, "kl": 0.005348893988411874, "learning_rate": 2.4972222222222224e-06, "loss": 0.0002, "num_tokens": 1643102.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 5506 }, { "clip_ratio/high_max": 0.011627906933426857, "clip_ratio/high_mean": 0.011627906933426857, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.011627906933426857, "completion_length": 22.25, "completions/clipped_ratio": 0.0, "completions/max_length": 28.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 22.25, "completions/mean_terminated_length": 22.25, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 101.98148148148148, "frac_reward_zero_std": 0.0, "grad_norm": 2.959026575088501, "kl": 0.09452280029654503, "learning_rate": 2.4966666666666668e-06, "loss": -0.0768, "num_tokens": 1643411.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 5507 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 102.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.048355814069509506, "kl": 0.0027684043161571026, "learning_rate": 2.4961111111111115e-06, "loss": 0.0001, "num_tokens": 1643681.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5508 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.5, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 12.5, "completions/mean_terminated_length": 12.5, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 102.01851851851852, "frac_reward_zero_std": 0.0, "grad_norm": 5.579277992248535, "kl": 0.024515529163181782, "learning_rate": 2.495555555555556e-06, "loss": 0.0073, "num_tokens": 1643947.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 5509 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 34.75, "completions/clipped_ratio": 0.0, "completions/max_length": 42.0, "completions/max_terminated_length": 42.0, "completions/mean_length": 34.75, "completions/mean_terminated_length": 34.75, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 102.03703703703704, "frac_reward_zero_std": 0.0, "grad_norm": 1.5671530961990356, "kl": 0.05231703631579876, "learning_rate": 2.4950000000000003e-06, "loss": -0.0437, "num_tokens": 1644310.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 5510 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 102.05555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.08306485414505005, "kl": 0.009852750925347209, "learning_rate": 2.4944444444444446e-06, "loss": 0.0006, "num_tokens": 1644586.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5511 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 102.07407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.011461027897894382, "kl": 0.0015552272088825703, "learning_rate": 2.493888888888889e-06, "loss": 0.0001, "num_tokens": 1644904.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5512 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.75, "completions/clipped_ratio": 0.0, "completions/max_length": 31.0, "completions/max_terminated_length": 31.0, "completions/mean_length": 25.75, "completions/mean_terminated_length": 25.75, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 102.0925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.008668001741170883, "kl": 0.07923033088445663, "learning_rate": 2.4933333333333333e-06, "loss": 0.0039, "num_tokens": 1645275.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 5513 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 5.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 5.0, "completions/mean_terminated_length": 5.0, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 102.11111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.0029924819245934486, "kl": 0.0005078613758087158, "learning_rate": 2.4927777777777777e-06, "loss": 0.0, "num_tokens": 1645495.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 5514 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.25, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 17.25, "completions/mean_terminated_length": 17.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 102.12962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.4482656419277191, "kl": 0.2195238471031189, "learning_rate": 2.4922222222222225e-06, "loss": 0.011, "num_tokens": 1645804.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5515 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.5, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 12.5, "completions/mean_terminated_length": 12.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 102.14814814814815, "frac_reward_zero_std": 0.0, "grad_norm": 5.710933685302734, "kl": 0.01484463270753622, "learning_rate": 2.491666666666667e-06, "loss": 0.165, "num_tokens": 1646074.0, "reward": 2.75, "reward_std": 1.5, "rewards/reward_combined/mean": 2.75, "rewards/reward_combined/std": 1.5, "step": 5516 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.5, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 9.5, "completions/mean_terminated_length": 9.5, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 102.16666666666667, "frac_reward_zero_std": 1.0, "grad_norm": 0.019514666870236397, "kl": 0.005388738354668021, "learning_rate": 2.491111111111111e-06, "loss": 0.0003, "num_tokens": 1646336.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5517 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 34.25, "completions/clipped_ratio": 0.0, "completions/max_length": 36.0, "completions/max_terminated_length": 36.0, "completions/mean_length": 34.25, "completions/mean_terminated_length": 34.25, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 102.18518518518519, "frac_reward_zero_std": 1.0, "grad_norm": 0.028111839666962624, "kl": 0.07157425582408905, "learning_rate": 2.490555555555556e-06, "loss": 0.0036, "num_tokens": 1646701.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5518 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 102.20370370370371, "frac_reward_zero_std": 0.0, "grad_norm": 5.647228717803955, "kl": 0.04666347615420818, "learning_rate": 2.4900000000000003e-06, "loss": -0.0713, "num_tokens": 1646981.0, "reward": 7.625, "reward_std": 0.25, "rewards/reward_combined/mean": 7.625, "rewards/reward_combined/std": 0.25, "step": 5519 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.25, "completions/clipped_ratio": 0.0, "completions/max_length": 32.0, "completions/max_terminated_length": 32.0, "completions/mean_length": 16.25, "completions/mean_terminated_length": 16.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 102.22222222222223, "frac_reward_zero_std": 0.0, "grad_norm": 3.9130663871765137, "kl": 0.013536634505726397, "learning_rate": 2.4894444444444447e-06, "loss": 0.2435, "num_tokens": 1647298.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 5520 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.5, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 12.5, "completions/mean_terminated_length": 12.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 102.24074074074075, "frac_reward_zero_std": 1.0, "grad_norm": 0.0009362853597849607, "kl": 0.0018524638144299388, "learning_rate": 2.488888888888889e-06, "loss": 0.0001, "num_tokens": 1647572.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5521 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 27.75, "completions/clipped_ratio": 0.0, "completions/max_length": 32.0, "completions/max_terminated_length": 32.0, "completions/mean_length": 27.75, "completions/mean_terminated_length": 27.75, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 102.25925925925925, "frac_reward_zero_std": 1.0, "grad_norm": 0.19461257755756378, "kl": 0.05732526257634163, "learning_rate": 2.4883333333333334e-06, "loss": 0.003, "num_tokens": 1647931.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 5522 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 102.27777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.07578166574239731, "kl": 0.00980814453214407, "learning_rate": 2.4877777777777777e-06, "loss": 0.0005, "num_tokens": 1648203.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5523 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.5, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 22.5, "completions/mean_terminated_length": 22.5, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 102.29629629629629, "frac_reward_zero_std": 1.0, "grad_norm": 0.021403957158327103, "kl": 0.005372380139306188, "learning_rate": 2.4872222222222225e-06, "loss": 0.0003, "num_tokens": 1648537.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5524 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 47.0, "completions/clipped_ratio": 0.0, "completions/max_length": 51.0, "completions/max_terminated_length": 51.0, "completions/mean_length": 47.0, "completions/mean_terminated_length": 47.0, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 102.31481481481481, "frac_reward_zero_std": 1.0, "grad_norm": 0.023872531950473785, "kl": 0.04521903581917286, "learning_rate": 2.486666666666667e-06, "loss": 0.0023, "num_tokens": 1649005.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 5525 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 102.33333333333333, "frac_reward_zero_std": 1.0, "grad_norm": 0.11882637441158295, "kl": 0.009022568352520466, "learning_rate": 2.4861111111111112e-06, "loss": 0.0007, "num_tokens": 1649221.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 5526 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 102.35185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.0520920604467392, "kl": 0.05707976594567299, "learning_rate": 2.4855555555555556e-06, "loss": 0.0031, "num_tokens": 1649497.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5527 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.25, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 19.25, "completions/mean_terminated_length": 19.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 102.37037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.024259457364678383, "kl": 0.019943345338106155, "learning_rate": 2.4850000000000003e-06, "loss": 0.001, "num_tokens": 1649822.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5528 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 102.38888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.009214324876666069, "kl": 0.0009160339832305908, "learning_rate": 2.4844444444444447e-06, "loss": 0.0, "num_tokens": 1650066.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 5529 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.5, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 2.5, "completions/mean_terminated_length": 2.5, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 102.4074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.1228889673948288, "kl": 0.004293181002140045, "learning_rate": 2.483888888888889e-06, "loss": 0.0003, "num_tokens": 1650272.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 5530 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.5, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.5, "completions/mean_terminated_length": 6.5, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 102.42592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.004703023005276918, "kl": 0.0006488128929049708, "learning_rate": 2.4833333333333334e-06, "loss": 0.0, "num_tokens": 1650506.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 5531 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.75, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 24.75, "completions/mean_terminated_length": 24.75, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 102.44444444444444, "frac_reward_zero_std": 0.0, "grad_norm": 2.3811802864074707, "kl": 0.2844196930527687, "learning_rate": 2.4827777777777778e-06, "loss": 0.026, "num_tokens": 1650841.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 5532 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 102.46296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.10337254405021667, "kl": 0.019540791399776936, "learning_rate": 2.4822222222222225e-06, "loss": 0.001, "num_tokens": 1651107.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5533 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 102.48148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.019042188301682472, "kl": 0.029911242425441742, "learning_rate": 2.481666666666667e-06, "loss": 0.0015, "num_tokens": 1651323.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 5534 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.0, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 20.0, "completions/mean_terminated_length": 20.0, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 102.5, "frac_reward_zero_std": 1.0, "grad_norm": 0.18919405341148376, "kl": 0.06094091758131981, "learning_rate": 2.4811111111111113e-06, "loss": 0.003, "num_tokens": 1651655.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5535 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 102.51851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.00888847466558218, "kl": 0.0007103905081748962, "learning_rate": 2.4805555555555556e-06, "loss": 0.0, "num_tokens": 1651927.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5536 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 27.5, "completions/clipped_ratio": 0.0, "completions/max_length": 29.0, "completions/max_terminated_length": 29.0, "completions/mean_length": 27.5, "completions/mean_terminated_length": 27.5, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 102.53703703703704, "frac_reward_zero_std": 0.0, "grad_norm": 2.4618794918060303, "kl": 0.1834569089114666, "learning_rate": 2.4800000000000004e-06, "loss": -0.0475, "num_tokens": 1652261.0, "reward": 7.875, "reward_std": 0.25, "rewards/reward_combined/mean": 7.875, "rewards/reward_combined/std": 0.25, "step": 5537 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.25, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 14.25, "completions/mean_terminated_length": 14.25, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 102.55555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.12396698445081711, "kl": 0.04682730697095394, "learning_rate": 2.4794444444444448e-06, "loss": 0.0026, "num_tokens": 1652542.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5538 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.25, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 18.25, "completions/mean_terminated_length": 18.25, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 102.57407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.02951684221625328, "kl": 0.007103677839040756, "learning_rate": 2.478888888888889e-06, "loss": 0.0004, "num_tokens": 1652843.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5539 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.25, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.25, "completions/mean_terminated_length": 14.25, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 102.5925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.04245077818632126, "kl": 0.0247166333720088, "learning_rate": 2.4783333333333335e-06, "loss": 0.0013, "num_tokens": 1653132.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5540 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 102.61111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.10434053838253021, "kl": 0.049305928871035576, "learning_rate": 2.4777777777777782e-06, "loss": 0.0025, "num_tokens": 1653424.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5541 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.25, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 22.25, "completions/mean_terminated_length": 22.25, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 102.62962962962963, "frac_reward_zero_std": 0.0, "grad_norm": 3.2755324840545654, "kl": 0.1715521290898323, "learning_rate": 2.4772222222222226e-06, "loss": 0.0038, "num_tokens": 1653733.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 5542 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 102.64814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.0005337389884516597, "kl": 3.3058226108551025e-05, "learning_rate": 2.476666666666667e-06, "loss": 0.0, "num_tokens": 1653953.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 5543 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 102.66666666666667, "frac_reward_zero_std": 1.0, "grad_norm": 0.006887510884553194, "kl": 0.009455777704715729, "learning_rate": 2.4761111111111113e-06, "loss": 0.0005, "num_tokens": 1654189.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 5544 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 102.68518518518519, "frac_reward_zero_std": 1.0, "grad_norm": 0.3441609740257263, "kl": 0.05767953675240278, "learning_rate": 2.4755555555555557e-06, "loss": 0.0031, "num_tokens": 1654475.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5545 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.25, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 10.25, "completions/mean_terminated_length": 10.25, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 102.70370370370371, "frac_reward_zero_std": 1.0, "grad_norm": 0.01588129624724388, "kl": 0.0002623820837470703, "learning_rate": 2.475e-06, "loss": 0.0, "num_tokens": 1654732.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 5546 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.25, "completions/clipped_ratio": 0.0, "completions/max_length": 27.0, "completions/max_terminated_length": 27.0, "completions/mean_length": 23.25, "completions/mean_terminated_length": 23.25, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 102.72222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.03978109359741211, "kl": 0.06323739141225815, "learning_rate": 2.474444444444445e-06, "loss": 0.0032, "num_tokens": 1655053.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5547 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 102.74074074074075, "frac_reward_zero_std": 1.0, "grad_norm": 0.02109816111624241, "kl": 0.0031183988321572542, "learning_rate": 2.473888888888889e-06, "loss": 0.0002, "num_tokens": 1655309.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5548 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.008771929889917374, "clip_ratio/low_min": 0.008771929889917374, "clip_ratio/region_mean": 0.008771929889917374, "completion_length": 27.5, "completions/clipped_ratio": 0.0, "completions/max_length": 38.0, "completions/max_terminated_length": 38.0, "completions/mean_length": 27.5, "completions/mean_terminated_length": 27.5, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 102.75925925925925, "frac_reward_zero_std": 0.0, "grad_norm": 4.121896743774414, "kl": 0.10315002501010895, "learning_rate": 2.4733333333333335e-06, "loss": 0.1558, "num_tokens": 1655671.0, "reward": 5.875, "reward_std": 3.5910768508911133, "rewards/reward_combined/mean": 5.875, "rewards/reward_combined/std": 3.5910770893096924, "step": 5549 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.75, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 8.75, "completions/mean_terminated_length": 8.75, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 102.77777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 2.8119122982025146, "kl": 0.717215521261096, "learning_rate": 2.4727777777777783e-06, "loss": 0.0338, "num_tokens": 1655930.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5550 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 102.79629629629629, "frac_reward_zero_std": 0.0, "grad_norm": 0.9222787022590637, "kl": 0.4428175347857177, "learning_rate": 2.4722222222222226e-06, "loss": 0.0238, "num_tokens": 1656218.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 5551 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 102.81481481481481, "frac_reward_zero_std": 1.0, "grad_norm": 0.04094575718045235, "kl": 0.0007905960083007812, "learning_rate": 2.471666666666667e-06, "loss": 0.0, "num_tokens": 1656430.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 5552 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 102.83333333333333, "frac_reward_zero_std": 1.0, "grad_norm": 0.009254053235054016, "kl": 0.0037779523408971727, "learning_rate": 2.4711111111111114e-06, "loss": 0.0002, "num_tokens": 1656734.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5553 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 38.75, "completions/clipped_ratio": 0.0, "completions/max_length": 50.0, "completions/max_terminated_length": 50.0, "completions/mean_length": 38.75, "completions/mean_terminated_length": 38.75, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 102.85185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.08732199668884277, "kl": 0.04086933843791485, "learning_rate": 2.4705555555555557e-06, "loss": 0.0022, "num_tokens": 1657109.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5554 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.5, "completions/clipped_ratio": 0.0, "completions/max_length": 28.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 20.5, "completions/mean_terminated_length": 20.5, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 102.87037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.059847328811883926, "kl": 0.07562807947397232, "learning_rate": 2.47e-06, "loss": 0.0038, "num_tokens": 1657411.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 5555 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.0, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 22.0, "completions/mean_terminated_length": 22.0, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 102.88888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.02710452675819397, "kl": 0.01615491695702076, "learning_rate": 2.4694444444444444e-06, "loss": 0.0008, "num_tokens": 1657727.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5556 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 18.0, "completions/mean_terminated_length": 18.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 102.9074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.006834486965090036, "kl": 0.002601444721221924, "learning_rate": 2.468888888888889e-06, "loss": 0.0001, "num_tokens": 1658039.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5557 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.25, "completions/clipped_ratio": 0.0, "completions/max_length": 29.0, "completions/max_terminated_length": 29.0, "completions/mean_length": 24.25, "completions/mean_terminated_length": 24.25, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 102.92592592592592, "frac_reward_zero_std": 0.0, "grad_norm": 3.354923963546753, "kl": 0.23883940279483795, "learning_rate": 2.4683333333333336e-06, "loss": -0.0702, "num_tokens": 1658352.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 5558 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 102.94444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.015346618369221687, "kl": 0.008940006606280804, "learning_rate": 2.467777777777778e-06, "loss": 0.0004, "num_tokens": 1658664.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 5559 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 102.96296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.021059589460492134, "kl": 0.0036442987620830536, "learning_rate": 2.4672222222222227e-06, "loss": 0.0002, "num_tokens": 1658924.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 5560 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 102.98148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.029556775465607643, "kl": 0.2288762778043747, "learning_rate": 2.466666666666667e-06, "loss": 0.0114, "num_tokens": 1659226.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5561 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.75, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 19.75, "completions/mean_terminated_length": 19.75, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 103.0, "frac_reward_zero_std": 0.0, "grad_norm": 4.298452854156494, "kl": 0.035748597234487534, "learning_rate": 2.4661111111111114e-06, "loss": 0.0375, "num_tokens": 1659525.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 5562 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 103.01851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.019846562296152115, "kl": 0.011849933303892612, "learning_rate": 2.4655555555555558e-06, "loss": 0.0006, "num_tokens": 1659785.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5563 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 35.0, "completions/clipped_ratio": 0.0, "completions/max_length": 38.0, "completions/max_terminated_length": 38.0, "completions/mean_length": 35.0, "completions/mean_terminated_length": 35.0, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 103.03703703703704, "frac_reward_zero_std": 0.0, "grad_norm": 1.6404832601547241, "kl": 0.08095047622919083, "learning_rate": 2.465e-06, "loss": 0.0329, "num_tokens": 1660153.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 5564 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.0, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 22.0, "completions/mean_terminated_length": 22.0, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 103.05555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.029613761231303215, "kl": 0.016488718800246716, "learning_rate": 2.4644444444444445e-06, "loss": 0.0008, "num_tokens": 1660469.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5565 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 26.0, "completions/clipped_ratio": 0.0, "completions/max_length": 32.0, "completions/max_terminated_length": 32.0, "completions/mean_length": 26.0, "completions/mean_terminated_length": 26.0, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 103.07407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.24343974888324738, "kl": 0.04121161811053753, "learning_rate": 2.463888888888889e-06, "loss": 0.002, "num_tokens": 1660821.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 5566 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 103.0925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.011605004779994488, "kl": 0.008811983279883862, "learning_rate": 2.4633333333333336e-06, "loss": 0.0004, "num_tokens": 1661133.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 5567 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.75, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 12.75, "completions/mean_terminated_length": 12.75, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 103.11111111111111, "frac_reward_zero_std": 0.0, "grad_norm": 3.4937708377838135, "kl": 0.11116837710142136, "learning_rate": 2.462777777777778e-06, "loss": -0.1126, "num_tokens": 1661408.0, "reward": 7.75, "reward_std": 0.28867512941360474, "rewards/reward_combined/mean": 7.75, "rewards/reward_combined/std": 0.28867512941360474, "step": 5568 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 103.12962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.022699542343616486, "kl": 0.006970549002289772, "learning_rate": 2.4622222222222223e-06, "loss": 0.0003, "num_tokens": 1661680.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5569 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 103.14814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.0707366093993187, "kl": 0.0181250162422657, "learning_rate": 2.461666666666667e-06, "loss": 0.001, "num_tokens": 1661968.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5570 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 35.5, "completions/clipped_ratio": 0.0, "completions/max_length": 43.0, "completions/max_terminated_length": 43.0, "completions/mean_length": 35.5, "completions/mean_terminated_length": 35.5, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 103.16666666666667, "frac_reward_zero_std": 0.0, "grad_norm": 2.877955436706543, "kl": 0.10943334549665451, "learning_rate": 2.4611111111111115e-06, "loss": 0.0071, "num_tokens": 1662346.0, "reward": 4.375, "reward_std": 4.049177169799805, "rewards/reward_combined/mean": 4.375, "rewards/reward_combined/std": 4.049177169799805, "step": 5571 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.0, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 20.0, "completions/mean_terminated_length": 20.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 103.18518518518519, "frac_reward_zero_std": 1.0, "grad_norm": 0.047392647713422775, "kl": 0.02639351785182953, "learning_rate": 2.460555555555556e-06, "loss": 0.0013, "num_tokens": 1662674.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5572 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 103.20370370370371, "frac_reward_zero_std": 1.0, "grad_norm": 0.008066931739449501, "kl": 0.00910438597202301, "learning_rate": 2.46e-06, "loss": 0.0005, "num_tokens": 1662910.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 5573 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 17.0, "completions/mean_terminated_length": 17.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 103.22222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.01666996255517006, "kl": 0.17074329406023026, "learning_rate": 2.4594444444444445e-06, "loss": 0.0085, "num_tokens": 1663218.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5574 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.75, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 13.75, "completions/mean_terminated_length": 13.75, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 103.24074074074075, "frac_reward_zero_std": 1.0, "grad_norm": 0.7252640128135681, "kl": 0.11332177836447954, "learning_rate": 2.458888888888889e-06, "loss": 0.0073, "num_tokens": 1663497.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5575 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 103.25925925925925, "frac_reward_zero_std": 1.0, "grad_norm": 0.01185721717774868, "kl": 0.0034667067229747772, "learning_rate": 2.4583333333333332e-06, "loss": 0.0002, "num_tokens": 1663757.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 5576 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.5, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 18.5, "completions/mean_terminated_length": 18.5, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 103.27777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 1.5202884674072266, "kl": 0.2500573927536607, "learning_rate": 2.457777777777778e-06, "loss": 0.0112, "num_tokens": 1664059.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5577 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.75, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 3.75, "completions/mean_terminated_length": 3.75, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 103.29629629629629, "frac_reward_zero_std": 1.0, "grad_norm": 0.09597038477659225, "kl": 0.00747075816616416, "learning_rate": 2.4572222222222224e-06, "loss": 0.0004, "num_tokens": 1664270.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 5578 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 103.31481481481481, "frac_reward_zero_std": 1.0, "grad_norm": 0.009338652715086937, "kl": 0.0009076967835426331, "learning_rate": 2.4566666666666667e-06, "loss": 0.0, "num_tokens": 1664514.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 5579 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.75, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 15.75, "completions/mean_terminated_length": 15.75, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 103.33333333333333, "frac_reward_zero_std": 1.0, "grad_norm": 0.08981441706418991, "kl": 0.020825155545026064, "learning_rate": 2.4561111111111115e-06, "loss": 0.0011, "num_tokens": 1664793.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5580 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 103.35185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.030128460377454758, "kl": 0.002772117732092738, "learning_rate": 2.455555555555556e-06, "loss": 0.0001, "num_tokens": 1665053.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5581 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 103.37037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.04940595477819443, "kl": 0.0030954480171203613, "learning_rate": 2.4550000000000002e-06, "loss": 0.0002, "num_tokens": 1665265.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 5582 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 103.38888888888889, "frac_reward_zero_std": 0.0, "grad_norm": 3.0732994079589844, "kl": 0.031237422736012377, "learning_rate": 2.4544444444444446e-06, "loss": 0.0336, "num_tokens": 1665575.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 5583 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 103.4074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.013997764326632023, "kl": 0.003972329664975405, "learning_rate": 2.453888888888889e-06, "loss": 0.0002, "num_tokens": 1665879.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5584 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.0, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 24.0, "completions/mean_terminated_length": 24.0, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 103.42592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.005151594989001751, "kl": 0.0894521176815033, "learning_rate": 2.4533333333333333e-06, "loss": 0.0045, "num_tokens": 1666243.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 5585 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 27.5, "completions/clipped_ratio": 0.0, "completions/max_length": 36.0, "completions/max_terminated_length": 36.0, "completions/mean_length": 27.5, "completions/mean_terminated_length": 27.5, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 103.44444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.6718776822090149, "kl": 0.26932719349861145, "learning_rate": 2.452777777777778e-06, "loss": 0.0119, "num_tokens": 1666569.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5586 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 103.46296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.0006377811660058796, "kl": 4.258006811141968e-05, "learning_rate": 2.4522222222222224e-06, "loss": 0.0, "num_tokens": 1666789.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 5587 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 12.5, "completions/mean_terminated_length": 12.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 103.48148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.08769865334033966, "kl": 0.010329136159271002, "learning_rate": 2.4516666666666668e-06, "loss": 0.0005, "num_tokens": 1667071.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5588 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.75, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 20.75, "completions/mean_terminated_length": 20.75, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 103.5, "frac_reward_zero_std": 1.0, "grad_norm": 0.4633828103542328, "kl": 0.12542473152279854, "learning_rate": 2.451111111111111e-06, "loss": 0.0063, "num_tokens": 1667374.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 5589 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.5, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 18.5, "completions/mean_terminated_length": 18.5, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 103.51851851851852, "frac_reward_zero_std": 0.0, "grad_norm": 4.5782623291015625, "kl": 0.019798667170107365, "learning_rate": 2.450555555555556e-06, "loss": -0.0136, "num_tokens": 1667668.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 5590 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.75, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 10.75, "completions/mean_terminated_length": 10.75, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 103.53703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.04883423447608948, "kl": 0.00783828180283308, "learning_rate": 2.4500000000000003e-06, "loss": 0.0004, "num_tokens": 1667931.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5591 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 103.55555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.0009018346318043768, "kl": 0.001695032638963312, "learning_rate": 2.4494444444444446e-06, "loss": 0.0001, "num_tokens": 1668211.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5592 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.0, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 20.0, "completions/mean_terminated_length": 20.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 103.57407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.0366603285074234, "kl": 0.008011693600565195, "learning_rate": 2.448888888888889e-06, "loss": 0.0005, "num_tokens": 1668531.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5593 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 5.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 5.0, "completions/mean_terminated_length": 5.0, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 103.5925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.0029949501622468233, "kl": 0.0005628645594697446, "learning_rate": 2.4483333333333333e-06, "loss": 0.0, "num_tokens": 1668751.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 5594 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 28.5, "completions/clipped_ratio": 0.0, "completions/max_length": 37.0, "completions/max_terminated_length": 37.0, "completions/mean_length": 28.5, "completions/mean_terminated_length": 28.5, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 103.61111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.388144314289093, "kl": 0.11898202449083328, "learning_rate": 2.447777777777778e-06, "loss": 0.0061, "num_tokens": 1669085.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5595 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 103.62962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.021832484751939774, "kl": 0.24135616421699524, "learning_rate": 2.4472222222222225e-06, "loss": 0.0121, "num_tokens": 1669385.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5596 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.5, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.5, "completions/mean_terminated_length": 6.5, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 103.64814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.00807621143758297, "kl": 0.00035766404471360147, "learning_rate": 2.446666666666667e-06, "loss": 0.0, "num_tokens": 1669619.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 5597 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 28.75, "completions/clipped_ratio": 0.0, "completions/max_length": 31.0, "completions/max_terminated_length": 31.0, "completions/mean_length": 28.75, "completions/mean_terminated_length": 28.75, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 103.66666666666667, "frac_reward_zero_std": 1.0, "grad_norm": 0.2557663023471832, "kl": 0.17843373864889145, "learning_rate": 2.446111111111111e-06, "loss": 0.0089, "num_tokens": 1669958.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5598 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.75, "completions/clipped_ratio": 0.0, "completions/max_length": 31.0, "completions/max_terminated_length": 31.0, "completions/mean_length": 25.75, "completions/mean_terminated_length": 25.75, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 103.68518518518519, "frac_reward_zero_std": 1.0, "grad_norm": 0.030201023444533348, "kl": 0.05201954022049904, "learning_rate": 2.4455555555555555e-06, "loss": 0.0024, "num_tokens": 1670289.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5599 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 7.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 7.0, "completions/mean_terminated_length": 7.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 103.70370370370371, "frac_reward_zero_std": 1.0, "grad_norm": 0.03715500235557556, "kl": 0.0014945015136618167, "learning_rate": 2.4450000000000003e-06, "loss": 0.0, "num_tokens": 1670537.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 5600 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.5, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 22.5, "completions/mean_terminated_length": 22.5, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 103.72222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.028662055730819702, "kl": 0.00811052555218339, "learning_rate": 2.4444444444444447e-06, "loss": 0.0004, "num_tokens": 1670871.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5601 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 27.0, "completions/clipped_ratio": 0.0, "completions/max_length": 42.0, "completions/max_terminated_length": 42.0, "completions/mean_length": 27.0, "completions/mean_terminated_length": 27.0, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 103.74074074074075, "frac_reward_zero_std": 0.0, "grad_norm": 2.0467538833618164, "kl": 0.03977851942181587, "learning_rate": 2.443888888888889e-06, "loss": 0.1025, "num_tokens": 1671231.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 5602 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.25, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 20.25, "completions/mean_terminated_length": 20.25, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 103.75925925925925, "frac_reward_zero_std": 1.0, "grad_norm": 0.07721295952796936, "kl": 0.03680736757814884, "learning_rate": 2.443333333333334e-06, "loss": 0.0018, "num_tokens": 1671564.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5603 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.5, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 9.5, "completions/mean_terminated_length": 9.5, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 103.77777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.02043692208826542, "kl": 0.004586476366966963, "learning_rate": 2.442777777777778e-06, "loss": 0.0002, "num_tokens": 1671826.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5604 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.75, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.75, "completions/mean_terminated_length": 14.75, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 103.79629629629629, "frac_reward_zero_std": 1.0, "grad_norm": 0.02249213494360447, "kl": 0.0017384563689120114, "learning_rate": 2.4422222222222225e-06, "loss": 0.0001, "num_tokens": 1672097.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5605 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 103.81481481481481, "frac_reward_zero_std": 1.0, "grad_norm": 0.009372985921800137, "kl": 0.0007195473008323461, "learning_rate": 2.441666666666667e-06, "loss": 0.0, "num_tokens": 1672369.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5606 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.75, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.75, "completions/mean_terminated_length": 14.75, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 103.83333333333333, "frac_reward_zero_std": 1.0, "grad_norm": 0.09583240002393723, "kl": 0.029260587878525257, "learning_rate": 2.4411111111111112e-06, "loss": 0.0015, "num_tokens": 1672660.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5607 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 103.85185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.013318100944161415, "kl": 0.00029803812503814697, "learning_rate": 2.4405555555555556e-06, "loss": 0.0, "num_tokens": 1672916.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 5608 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 103.87037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.05976519733667374, "kl": 0.00900553585961461, "learning_rate": 2.4400000000000004e-06, "loss": 0.0005, "num_tokens": 1673204.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5609 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.75, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 14.75, "completions/mean_terminated_length": 14.75, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 103.88888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.14001913368701935, "kl": 0.03948806179687381, "learning_rate": 2.4394444444444447e-06, "loss": 0.0022, "num_tokens": 1673487.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5610 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.25, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 4.25, "completions/mean_terminated_length": 4.25, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 103.9074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.05701345205307007, "kl": 0.003511556889861822, "learning_rate": 2.438888888888889e-06, "loss": 0.0002, "num_tokens": 1673700.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 5611 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.0, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 20.0, "completions/mean_terminated_length": 20.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 103.92592592592592, "frac_reward_zero_std": 0.0, "grad_norm": 4.138713359832764, "kl": 0.16540008783340454, "learning_rate": 2.438333333333334e-06, "loss": 0.1073, "num_tokens": 1674000.0, "reward": 6.125, "reward_std": 3.4247870445251465, "rewards/reward_combined/mean": 6.125, "rewards/reward_combined/std": 3.4247870445251465, "step": 5612 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.75, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 8.75, "completions/mean_terminated_length": 8.75, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 103.94444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.15293893218040466, "kl": 0.040566062554717064, "learning_rate": 2.437777777777778e-06, "loss": 0.0024, "num_tokens": 1674235.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 5613 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 50.0, "completions/clipped_ratio": 0.0, "completions/max_length": 65.0, "completions/max_terminated_length": 65.0, "completions/mean_length": 50.0, "completions/mean_terminated_length": 50.0, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "epoch": 103.96296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.09554877877235413, "kl": 0.051612548530101776, "learning_rate": 2.4372222222222226e-06, "loss": 0.0025, "num_tokens": 1674715.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 5614 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.5, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 16.5, "completions/mean_terminated_length": 16.5, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 103.98148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.029680676758289337, "kl": 0.003830839297734201, "learning_rate": 2.436666666666667e-06, "loss": 0.0002, "num_tokens": 1675041.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5615 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 92.5, "completions/clipped_ratio": 0.25, "completions/max_length": 256.0, "completions/max_terminated_length": 50.0, "completions/mean_length": 92.5, "completions/mean_terminated_length": 38.0, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 104.0, "frac_reward_zero_std": 0.0, "grad_norm": 1.565948247909546, "kl": 0.0516146719455719, "learning_rate": 2.4361111111111113e-06, "loss": 0.3959, "num_tokens": 1675635.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 5616 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.25, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.25, "completions/mean_terminated_length": 13.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 104.01851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.005805522669106722, "kl": 0.0018732735188677907, "learning_rate": 2.4355555555555556e-06, "loss": 0.0001, "num_tokens": 1675912.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5617 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.5, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 9.5, "completions/mean_terminated_length": 9.5, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 104.03703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.010314513929188251, "kl": 0.0018368021119385958, "learning_rate": 2.435e-06, "loss": 0.0001, "num_tokens": 1676170.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5618 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.25, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 13.25, "completions/mean_terminated_length": 13.25, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 104.05555555555556, "frac_reward_zero_std": 0.0, "grad_norm": 4.932293891906738, "kl": 0.02347030080272816, "learning_rate": 2.4344444444444448e-06, "loss": 0.1939, "num_tokens": 1676443.0, "reward": 2.75, "reward_std": 1.5, "rewards/reward_combined/mean": 2.75, "rewards/reward_combined/std": 1.5, "step": 5619 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 28.5, "completions/clipped_ratio": 0.0, "completions/max_length": 38.0, "completions/max_terminated_length": 38.0, "completions/mean_length": 28.5, "completions/mean_terminated_length": 28.5, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 104.07407407407408, "frac_reward_zero_std": 0.0, "grad_norm": 2.268324613571167, "kl": 0.09545281901955605, "learning_rate": 2.433888888888889e-06, "loss": 0.122, "num_tokens": 1676793.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 5620 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.75, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 12.75, "completions/mean_terminated_length": 12.75, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 104.0925925925926, "frac_reward_zero_std": 0.0, "grad_norm": 3.0760228633880615, "kl": 0.038268379867076874, "learning_rate": 2.4333333333333335e-06, "loss": 0.0249, "num_tokens": 1677068.0, "reward": 7.625, "reward_std": 0.25, "rewards/reward_combined/mean": 7.625, "rewards/reward_combined/std": 0.25, "step": 5621 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.5, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 16.5, "completions/mean_terminated_length": 16.5, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 104.11111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.028653642162680626, "kl": 0.07797255739569664, "learning_rate": 2.4327777777777782e-06, "loss": 0.0039, "num_tokens": 1677354.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 5622 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.75, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 4.75, "completions/mean_terminated_length": 4.75, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 104.12962962962963, "frac_reward_zero_std": 0.0, "grad_norm": 2.3906779289245605, "kl": 0.045016048941761255, "learning_rate": 2.4322222222222226e-06, "loss": -0.0009, "num_tokens": 1677573.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 5623 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.25, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 16.25, "completions/mean_terminated_length": 16.25, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 104.14814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.07186689972877502, "kl": 0.01569122076034546, "learning_rate": 2.431666666666667e-06, "loss": 0.0008, "num_tokens": 1677854.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5624 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.0, "completions/clipped_ratio": 0.0, "completions/max_length": 36.0, "completions/max_terminated_length": 36.0, "completions/mean_length": 17.0, "completions/mean_terminated_length": 17.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 104.16666666666667, "frac_reward_zero_std": 1.0, "grad_norm": 0.09095453470945358, "kl": 0.015211077407002449, "learning_rate": 2.4311111111111113e-06, "loss": 0.001, "num_tokens": 1678142.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5625 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 104.18518518518519, "frac_reward_zero_std": 1.0, "grad_norm": 0.04219279810786247, "kl": 0.004902251064777374, "learning_rate": 2.4305555555555557e-06, "loss": 0.0002, "num_tokens": 1678350.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 5626 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 104.20370370370371, "frac_reward_zero_std": 1.0, "grad_norm": 0.06957779079675674, "kl": 0.017947870772331953, "learning_rate": 2.43e-06, "loss": 0.0009, "num_tokens": 1678642.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5627 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 104.22222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.009139263071119785, "kl": 0.008841581642627716, "learning_rate": 2.4294444444444444e-06, "loss": 0.0004, "num_tokens": 1678878.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 5628 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 104.24074074074075, "frac_reward_zero_std": 1.0, "grad_norm": 0.03854931890964508, "kl": 0.0005135834217071533, "learning_rate": 2.428888888888889e-06, "loss": 0.0, "num_tokens": 1679090.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 5629 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.75, "completions/clipped_ratio": 0.0, "completions/max_length": 28.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 23.75, "completions/mean_terminated_length": 23.75, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 104.25925925925925, "frac_reward_zero_std": 0.0, "grad_norm": 3.7237935066223145, "kl": 0.25723934173583984, "learning_rate": 2.4283333333333335e-06, "loss": 0.0555, "num_tokens": 1679405.0, "reward": 4.0, "reward_std": 2.915475845336914, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 2.915475845336914, "step": 5630 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 104.27777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.023261865600943565, "kl": 0.004189026076346636, "learning_rate": 2.427777777777778e-06, "loss": 0.0002, "num_tokens": 1679705.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5631 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 104.29629629629629, "frac_reward_zero_std": 1.0, "grad_norm": 0.02625439502298832, "kl": 0.009117868263274431, "learning_rate": 2.4272222222222227e-06, "loss": 0.0005, "num_tokens": 1679965.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5632 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 104.31481481481481, "frac_reward_zero_std": 1.0, "grad_norm": 0.027309391647577286, "kl": 0.001595887792063877, "learning_rate": 2.426666666666667e-06, "loss": 0.0001, "num_tokens": 1680235.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5633 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 96.25, "completions/clipped_ratio": 0.25, "completions/max_length": 256.0, "completions/max_terminated_length": 51.0, "completions/mean_length": 96.25, "completions/mean_terminated_length": 43.0, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 104.33333333333333, "frac_reward_zero_std": 0.0, "grad_norm": 1.1959805488586426, "kl": 0.11889890301972628, "learning_rate": 2.4261111111111114e-06, "loss": 0.1459, "num_tokens": 1680840.0, "reward": 4.625, "reward_std": 3.4731109142303467, "rewards/reward_combined/mean": 4.625, "rewards/reward_combined/std": 3.4731109142303467, "step": 5634 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 104.35185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.08674833178520203, "kl": 0.010245629120618105, "learning_rate": 2.4255555555555557e-06, "loss": 0.0005, "num_tokens": 1681110.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5635 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 104.37037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.004816879518330097, "kl": 0.0033306367695331573, "learning_rate": 2.425e-06, "loss": 0.0002, "num_tokens": 1681370.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 5636 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.0, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 19.0, "completions/mean_terminated_length": 19.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 104.38888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.09180159121751785, "kl": 0.009643346536904573, "learning_rate": 2.4244444444444444e-06, "loss": 0.0005, "num_tokens": 1681686.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5637 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.5, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 21.5, "completions/mean_terminated_length": 21.5, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 104.4074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.025947626680135727, "kl": 0.005060877650976181, "learning_rate": 2.4238888888888888e-06, "loss": 0.0003, "num_tokens": 1682016.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5638 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 104.42592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.08308103680610657, "kl": 0.026025786995887756, "learning_rate": 2.4233333333333336e-06, "loss": 0.0013, "num_tokens": 1682302.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5639 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 50.5, "completions/clipped_ratio": 0.0, "completions/max_length": 57.0, "completions/max_terminated_length": 57.0, "completions/mean_length": 50.5, "completions/mean_terminated_length": 50.5, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 104.44444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.08720619231462479, "kl": 0.04662519320845604, "learning_rate": 2.422777777777778e-06, "loss": 0.0023, "num_tokens": 1682784.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 5640 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.75, "completions/clipped_ratio": 0.0, "completions/max_length": 27.0, "completions/max_terminated_length": 27.0, "completions/mean_length": 21.75, "completions/mean_terminated_length": 21.75, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 104.46296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.6798386573791504, "kl": 0.14887593686580658, "learning_rate": 2.4222222222222223e-06, "loss": 0.008, "num_tokens": 1683099.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5641 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 104.48148148148148, "frac_reward_zero_std": 0.0, "grad_norm": 7.297842025756836, "kl": 0.061099687591195107, "learning_rate": 2.421666666666667e-06, "loss": 0.0744, "num_tokens": 1683391.0, "reward": 6.75, "reward_std": 2.5, "rewards/reward_combined/mean": 6.75, "rewards/reward_combined/std": 2.5, "step": 5642 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 28.0, "completions/clipped_ratio": 0.0, "completions/max_length": 28.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 28.0, "completions/mean_terminated_length": 28.0, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 104.5, "frac_reward_zero_std": 0.0, "grad_norm": 2.9965744018554688, "kl": 0.19031595438718796, "learning_rate": 2.4211111111111114e-06, "loss": 0.0059, "num_tokens": 1683719.0, "reward": 5.75, "reward_std": 4.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 4.5, "step": 5643 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 31.0, "completions/clipped_ratio": 0.0, "completions/max_length": 36.0, "completions/max_terminated_length": 36.0, "completions/mean_length": 31.0, "completions/mean_terminated_length": 31.0, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 104.51851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.09033538401126862, "kl": 0.09217599406838417, "learning_rate": 2.4205555555555558e-06, "loss": 0.0046, "num_tokens": 1684071.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5644 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.25, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 15.25, "completions/mean_terminated_length": 15.25, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 104.53703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.006839577574282885, "kl": 0.001325861900113523, "learning_rate": 2.42e-06, "loss": 0.0001, "num_tokens": 1684392.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5645 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.75, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 18.75, "completions/mean_terminated_length": 18.75, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 104.55555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.10298965871334076, "kl": 0.06948839873075485, "learning_rate": 2.4194444444444445e-06, "loss": 0.0037, "num_tokens": 1684715.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5646 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 33.75, "completions/clipped_ratio": 0.0, "completions/max_length": 42.0, "completions/max_terminated_length": 42.0, "completions/mean_length": 33.75, "completions/mean_terminated_length": 33.75, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 104.57407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.09583083540201187, "kl": 0.04332956299185753, "learning_rate": 2.418888888888889e-06, "loss": 0.0022, "num_tokens": 1685074.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5647 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.5, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 9.5, "completions/mean_terminated_length": 9.5, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 104.5925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.018772298470139503, "kl": 0.004984894301742315, "learning_rate": 2.4183333333333336e-06, "loss": 0.0002, "num_tokens": 1685336.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5648 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 26.5, "completions/clipped_ratio": 0.0, "completions/max_length": 32.0, "completions/max_terminated_length": 32.0, "completions/mean_length": 26.5, "completions/mean_terminated_length": 26.5, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 104.61111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.30550527572631836, "kl": 0.07748527079820633, "learning_rate": 2.417777777777778e-06, "loss": 0.0039, "num_tokens": 1685690.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 5649 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.0, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 22.0, "completions/mean_terminated_length": 22.0, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 104.62962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.031933993101119995, "kl": 0.015386232174932957, "learning_rate": 2.4172222222222223e-06, "loss": 0.0008, "num_tokens": 1686006.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5650 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 104.64814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.4076353907585144, "kl": 0.06270750612020493, "learning_rate": 2.4166666666666667e-06, "loss": 0.0031, "num_tokens": 1686222.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 5651 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 104.66666666666667, "frac_reward_zero_std": 1.0, "grad_norm": 0.0008656058926135302, "kl": 6.20037317276001e-05, "learning_rate": 2.4161111111111115e-06, "loss": 0.0, "num_tokens": 1686442.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 5652 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 12.5, "completions/mean_terminated_length": 12.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 104.68518518518519, "frac_reward_zero_std": 0.0, "grad_norm": 5.195723533630371, "kl": 0.06450555744231679, "learning_rate": 2.415555555555556e-06, "loss": 0.1159, "num_tokens": 1686744.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 5653 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 18.0, "completions/mean_terminated_length": 18.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 104.70370370370371, "frac_reward_zero_std": 0.0, "grad_norm": 3.149439573287964, "kl": 0.020604178309440613, "learning_rate": 2.415e-06, "loss": 0.0091, "num_tokens": 1687036.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 5654 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.25, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.25, "completions/mean_terminated_length": 2.25, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 104.72222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.07548654079437256, "kl": 0.0027597646694630384, "learning_rate": 2.4144444444444445e-06, "loss": 0.0002, "num_tokens": 1687249.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 5655 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 104.74074074074075, "frac_reward_zero_std": 1.0, "grad_norm": 0.5450998544692993, "kl": 0.04590262472629547, "learning_rate": 2.413888888888889e-06, "loss": 0.0023, "num_tokens": 1687493.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 5656 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 104.75925925925925, "frac_reward_zero_std": 1.0, "grad_norm": 0.0025138966739177704, "kl": 0.00012950301606906578, "learning_rate": 2.4133333333333337e-06, "loss": 0.0, "num_tokens": 1687749.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 5657 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.0, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 20.0, "completions/mean_terminated_length": 20.0, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 104.77777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.05727282911539078, "kl": 0.030481509864330292, "learning_rate": 2.412777777777778e-06, "loss": 0.0015, "num_tokens": 1688081.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5658 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 104.79629629629629, "frac_reward_zero_std": 1.0, "grad_norm": 0.033339448273181915, "kl": 0.007462123641744256, "learning_rate": 2.4122222222222224e-06, "loss": 0.0004, "num_tokens": 1688361.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5659 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 104.81481481481481, "frac_reward_zero_std": 1.0, "grad_norm": 0.0223381444811821, "kl": 0.241273395717144, "learning_rate": 2.4116666666666667e-06, "loss": 0.0121, "num_tokens": 1688661.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5660 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 104.83333333333333, "frac_reward_zero_std": 1.0, "grad_norm": 0.039133939892053604, "kl": 0.010824200697243214, "learning_rate": 2.411111111111111e-06, "loss": 0.0005, "num_tokens": 1688933.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5661 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 28.75, "completions/clipped_ratio": 0.0, "completions/max_length": 30.0, "completions/max_terminated_length": 30.0, "completions/mean_length": 28.75, "completions/mean_terminated_length": 28.75, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 104.85185185185185, "frac_reward_zero_std": 0.0, "grad_norm": 4.272336483001709, "kl": 0.1614397093653679, "learning_rate": 2.410555555555556e-06, "loss": -0.0069, "num_tokens": 1689272.0, "reward": 5.75, "reward_std": 4.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 4.5, "step": 5662 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 17.0, "completions/mean_terminated_length": 17.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 104.87037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.05807565525174141, "kl": 0.17410924285650253, "learning_rate": 2.4100000000000002e-06, "loss": 0.0087, "num_tokens": 1689580.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5663 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.5, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.5, "completions/mean_terminated_length": 6.5, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 104.88888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.04709141328930855, "kl": 0.003314947010949254, "learning_rate": 2.4094444444444446e-06, "loss": 0.0002, "num_tokens": 1689814.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 5664 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.25, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.25, "completions/mean_terminated_length": 14.25, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 104.9074074074074, "frac_reward_zero_std": 0.0, "grad_norm": 10.523882865905762, "kl": 0.12869918026262894, "learning_rate": 2.4088888888888894e-06, "loss": 0.0334, "num_tokens": 1690103.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 5665 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.5, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 19.5, "completions/mean_terminated_length": 19.5, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 104.92592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.02473548986017704, "kl": 0.008235271088778973, "learning_rate": 2.4083333333333337e-06, "loss": 0.0004, "num_tokens": 1690409.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5666 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 104.94444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.01997358538210392, "kl": 0.037352846935391426, "learning_rate": 2.407777777777778e-06, "loss": 0.002, "num_tokens": 1690681.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5667 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.25, "completions/clipped_ratio": 0.0, "completions/max_length": 33.0, "completions/max_terminated_length": 33.0, "completions/mean_length": 20.25, "completions/mean_terminated_length": 20.25, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 104.96296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.1555347591638565, "kl": 0.0246950164437294, "learning_rate": 2.4072222222222224e-06, "loss": 0.0015, "num_tokens": 1691010.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 5668 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.0, "completions/max_length": 46.0, "completions/max_terminated_length": 46.0, "completions/mean_length": 32.0, "completions/mean_terminated_length": 32.0, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 104.98148148148148, "frac_reward_zero_std": 0.0, "grad_norm": 1.8091330528259277, "kl": 0.08400959149003029, "learning_rate": 2.4066666666666668e-06, "loss": 0.003, "num_tokens": 1691390.0, "reward": 7.75, "reward_std": 0.28867512941360474, "rewards/reward_combined/mean": 7.75, "rewards/reward_combined/std": 0.28867512941360474, "step": 5669 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 30.75, "completions/clipped_ratio": 0.0, "completions/max_length": 51.0, "completions/max_terminated_length": 51.0, "completions/mean_length": 30.75, "completions/mean_terminated_length": 30.75, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 105.0, "frac_reward_zero_std": 0.0, "grad_norm": 2.051798105239868, "kl": 0.35959960147738457, "learning_rate": 2.406111111111111e-06, "loss": 0.1869, "num_tokens": 1691781.0, "reward": 2.125, "reward_std": 1.6007810831069946, "rewards/reward_combined/mean": 2.125, "rewards/reward_combined/std": 1.6007810831069946, "step": 5670 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.5, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 4.5, "completions/mean_terminated_length": 4.5, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 105.01851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.10795794427394867, "kl": 0.008396189659833908, "learning_rate": 2.4055555555555555e-06, "loss": 0.0004, "num_tokens": 1691995.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 5671 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 33.25, "completions/clipped_ratio": 0.0, "completions/max_length": 38.0, "completions/max_terminated_length": 38.0, "completions/mean_length": 33.25, "completions/mean_terminated_length": 33.25, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 105.03703703703704, "frac_reward_zero_std": 0.0, "grad_norm": 3.155056953430176, "kl": 0.24594907462596893, "learning_rate": 2.4050000000000003e-06, "loss": 0.0211, "num_tokens": 1692352.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 5672 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.25, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.25, "completions/mean_terminated_length": 2.25, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 105.05555555555556, "frac_reward_zero_std": 0.0, "grad_norm": 10.589325904846191, "kl": 0.005925099132582545, "learning_rate": 2.4044444444444446e-06, "loss": 0.0931, "num_tokens": 1692565.0, "reward": 2.875, "reward_std": 1.25, "rewards/reward_combined/mean": 2.875, "rewards/reward_combined/std": 1.25, "step": 5673 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.0, "completions/clipped_ratio": 0.0, "completions/max_length": 30.0, "completions/max_terminated_length": 30.0, "completions/mean_length": 23.0, "completions/mean_terminated_length": 23.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 105.07407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.045151855796575546, "kl": 0.06590669229626656, "learning_rate": 2.4038888888888894e-06, "loss": 0.0032, "num_tokens": 1692877.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 5674 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 105.0925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.01934110000729561, "kl": 0.2321009263396263, "learning_rate": 2.4033333333333338e-06, "loss": 0.0116, "num_tokens": 1693179.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5675 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.5, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 8.5, "completions/mean_terminated_length": 8.5, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 105.11111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.043293341994285583, "kl": 0.002816291176714003, "learning_rate": 2.402777777777778e-06, "loss": 0.0001, "num_tokens": 1693433.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 5676 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 105.12962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.02390740066766739, "kl": 0.010304377879947424, "learning_rate": 2.4022222222222225e-06, "loss": 0.0005, "num_tokens": 1693693.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5677 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 10.5, "completions/mean_terminated_length": 10.5, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 105.14814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.010977563448250294, "kl": 0.00424610823392868, "learning_rate": 2.401666666666667e-06, "loss": 0.0002, "num_tokens": 1693959.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5678 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.75, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 13.75, "completions/mean_terminated_length": 13.75, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 105.16666666666667, "frac_reward_zero_std": 1.0, "grad_norm": 0.08458377420902252, "kl": 0.016632890328764915, "learning_rate": 2.401111111111111e-06, "loss": 0.001, "num_tokens": 1694234.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5679 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.75, "completions/clipped_ratio": 0.0, "completions/max_length": 25.0, "completions/max_terminated_length": 25.0, "completions/mean_length": 19.75, "completions/mean_terminated_length": 19.75, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 105.18518518518519, "frac_reward_zero_std": 1.0, "grad_norm": 0.03957729414105415, "kl": 0.002873337478376925, "learning_rate": 2.4005555555555555e-06, "loss": 0.0002, "num_tokens": 1694553.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5680 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 29.0, "completions/clipped_ratio": 0.0, "completions/max_length": 30.0, "completions/max_terminated_length": 30.0, "completions/mean_length": 29.0, "completions/mean_terminated_length": 29.0, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 105.20370370370371, "frac_reward_zero_std": 1.0, "grad_norm": 0.09877490997314453, "kl": 0.16673389077186584, "learning_rate": 2.4000000000000003e-06, "loss": 0.0083, "num_tokens": 1694885.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5681 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.25, "completions/clipped_ratio": 0.0, "completions/max_length": 37.0, "completions/max_terminated_length": 37.0, "completions/mean_length": 25.25, "completions/mean_terminated_length": 25.25, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 105.22222222222223, "frac_reward_zero_std": 0.0, "grad_norm": 3.032792329788208, "kl": 0.04489501379430294, "learning_rate": 2.3994444444444447e-06, "loss": -0.137, "num_tokens": 1695206.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 5682 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.75, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 3.75, "completions/mean_terminated_length": 3.75, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 105.24074074074075, "frac_reward_zero_std": 1.0, "grad_norm": 0.05131756141781807, "kl": 0.004570777760818601, "learning_rate": 2.398888888888889e-06, "loss": 0.0002, "num_tokens": 1695417.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 5683 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.75, "completions/clipped_ratio": 0.0, "completions/max_length": 41.0, "completions/max_terminated_length": 41.0, "completions/mean_length": 19.75, "completions/mean_terminated_length": 19.75, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 105.25925925925925, "frac_reward_zero_std": 1.0, "grad_norm": 0.26255789399147034, "kl": 0.03243271540850401, "learning_rate": 2.398333333333334e-06, "loss": 0.0022, "num_tokens": 1695712.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5684 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 105.27777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.006460732314735651, "kl": 0.00015976428403519094, "learning_rate": 2.397777777777778e-06, "loss": 0.0, "num_tokens": 1695968.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 5685 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.75, "completions/clipped_ratio": 0.0, "completions/max_length": 28.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 20.75, "completions/mean_terminated_length": 20.75, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 105.29629629629629, "frac_reward_zero_std": 1.0, "grad_norm": 0.26486068964004517, "kl": 0.04904253035783768, "learning_rate": 2.3972222222222225e-06, "loss": 0.0028, "num_tokens": 1696303.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5686 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.25, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.25, "completions/mean_terminated_length": 14.25, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 105.31481481481481, "frac_reward_zero_std": 1.0, "grad_norm": 0.019276602193713188, "kl": 0.0018949626246467233, "learning_rate": 2.396666666666667e-06, "loss": 0.0001, "num_tokens": 1696620.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5687 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 5.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 5.0, "completions/mean_terminated_length": 5.0, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 105.33333333333333, "frac_reward_zero_std": 1.0, "grad_norm": 0.0036664181388914585, "kl": 0.00034692883491516113, "learning_rate": 2.3961111111111112e-06, "loss": 0.0, "num_tokens": 1696840.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 5688 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 29.25, "completions/clipped_ratio": 0.0, "completions/max_length": 45.0, "completions/max_terminated_length": 45.0, "completions/mean_length": 29.25, "completions/mean_terminated_length": 29.25, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 105.35185185185185, "frac_reward_zero_std": 0.0, "grad_norm": 1.1873608827590942, "kl": 0.06736574321985245, "learning_rate": 2.3955555555555556e-06, "loss": -0.1489, "num_tokens": 1697225.0, "reward": 4.625, "reward_std": 2.25, "rewards/reward_combined/mean": 4.625, "rewards/reward_combined/std": 2.25, "step": 5689 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 105.37037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.00680735195055604, "kl": 0.0003891885280609131, "learning_rate": 2.395e-06, "loss": 0.0, "num_tokens": 1697469.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 5690 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 5.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 5.0, "completions/mean_terminated_length": 5.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 105.38888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.10363305360078812, "kl": 0.03074023500084877, "learning_rate": 2.3944444444444447e-06, "loss": 0.0016, "num_tokens": 1697689.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 5691 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.0, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 22.0, "completions/mean_terminated_length": 22.0, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 105.4074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.19078943133354187, "kl": 0.036982940044254065, "learning_rate": 2.393888888888889e-06, "loss": 0.0018, "num_tokens": 1698005.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5692 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 105.42592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.06702293455600739, "kl": 0.020718643674626946, "learning_rate": 2.3933333333333334e-06, "loss": 0.001, "num_tokens": 1698295.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5693 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.75, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 13.75, "completions/mean_terminated_length": 13.75, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 105.44444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.010925043374300003, "kl": 0.0013319129939191043, "learning_rate": 2.392777777777778e-06, "loss": 0.0001, "num_tokens": 1698562.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5694 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 47.5, "completions/clipped_ratio": 0.0, "completions/max_length": 51.0, "completions/max_terminated_length": 51.0, "completions/mean_length": 47.5, "completions/mean_terminated_length": 47.5, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 105.46296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.01363708358258009, "kl": 0.04541236162185669, "learning_rate": 2.3922222222222226e-06, "loss": 0.0023, "num_tokens": 1699032.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 5695 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.75, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 20.75, "completions/mean_terminated_length": 20.75, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 105.48148148148148, "frac_reward_zero_std": 0.0, "grad_norm": 3.05981707572937, "kl": 0.012357788626104593, "learning_rate": 2.391666666666667e-06, "loss": 0.0032, "num_tokens": 1699359.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 5696 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 105.5, "frac_reward_zero_std": 1.0, "grad_norm": 0.0007758021238259971, "kl": 5.40614128112793e-05, "learning_rate": 2.3911111111111113e-06, "loss": 0.0, "num_tokens": 1699579.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 5697 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 105.51851851851852, "frac_reward_zero_std": 0.0, "grad_norm": 3.2829766273498535, "kl": 0.188794806599617, "learning_rate": 2.3905555555555556e-06, "loss": 0.008, "num_tokens": 1699859.0, "reward": 7.75, "reward_std": 0.28867512941360474, "rewards/reward_combined/mean": 7.75, "rewards/reward_combined/std": 0.28867512941360474, "step": 5698 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 105.53703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.06076011061668396, "kl": 0.004141840385273099, "learning_rate": 2.39e-06, "loss": 0.0002, "num_tokens": 1700129.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5699 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 105.55555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.00879549141973257, "kl": 0.008956201374530792, "learning_rate": 2.3894444444444443e-06, "loss": 0.0004, "num_tokens": 1700365.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 5700 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.25, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 18.25, "completions/mean_terminated_length": 18.25, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 105.57407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.03467768058180809, "kl": 0.007838812423869967, "learning_rate": 2.388888888888889e-06, "loss": 0.0004, "num_tokens": 1700666.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5701 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.75, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 18.75, "completions/mean_terminated_length": 18.75, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 105.5925925925926, "frac_reward_zero_std": 0.0, "grad_norm": 3.32322359085083, "kl": 0.1651766002178192, "learning_rate": 2.3883333333333335e-06, "loss": 0.0952, "num_tokens": 1700981.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 5702 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.25, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 17.25, "completions/mean_terminated_length": 17.25, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 105.61111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.4160897731781006, "kl": 0.053353358060121536, "learning_rate": 2.387777777777778e-06, "loss": 0.0027, "num_tokens": 1701270.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5703 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 37.25, "completions/clipped_ratio": 0.0, "completions/max_length": 48.0, "completions/max_terminated_length": 48.0, "completions/mean_length": 37.25, "completions/mean_terminated_length": 37.25, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 105.62962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.09073541313409805, "kl": 0.054006654769182205, "learning_rate": 2.3872222222222226e-06, "loss": 0.0027, "num_tokens": 1701643.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5704 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 28.75, "completions/clipped_ratio": 0.0, "completions/max_length": 35.0, "completions/max_terminated_length": 35.0, "completions/mean_length": 28.75, "completions/mean_terminated_length": 28.75, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 105.64814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.07216142863035202, "kl": 0.030473683029413223, "learning_rate": 2.386666666666667e-06, "loss": 0.0015, "num_tokens": 1702006.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 5705 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 36.75, "completions/clipped_ratio": 0.0, "completions/max_length": 41.0, "completions/max_terminated_length": 41.0, "completions/mean_length": 36.75, "completions/mean_terminated_length": 36.75, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 105.66666666666667, "frac_reward_zero_std": 1.0, "grad_norm": 0.7923154234886169, "kl": 0.21578184142708778, "learning_rate": 2.3861111111111113e-06, "loss": 0.0115, "num_tokens": 1702381.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5706 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 105.68518518518519, "frac_reward_zero_std": 1.0, "grad_norm": 0.0076041147112846375, "kl": 0.032613611314445734, "learning_rate": 2.3855555555555557e-06, "loss": 0.0018, "num_tokens": 1702653.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5707 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 26.0, "completions/clipped_ratio": 0.0, "completions/max_length": 29.0, "completions/max_terminated_length": 29.0, "completions/mean_length": 26.0, "completions/mean_terminated_length": 26.0, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 105.70370370370371, "frac_reward_zero_std": 0.0, "grad_norm": 2.1934869289398193, "kl": 0.12597009539604187, "learning_rate": 2.385e-06, "loss": -0.004, "num_tokens": 1702985.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 5708 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 105.72222222222223, "frac_reward_zero_std": 0.0, "grad_norm": 6.614010810852051, "kl": 0.05254093470284715, "learning_rate": 2.3844444444444444e-06, "loss": 0.0738, "num_tokens": 1703273.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 5709 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 105.74074074074075, "frac_reward_zero_std": 1.0, "grad_norm": 0.4073199927806854, "kl": 0.0692074413964292, "learning_rate": 2.383888888888889e-06, "loss": 0.0037, "num_tokens": 1703571.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5710 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 105.75925925925925, "frac_reward_zero_std": 1.0, "grad_norm": 0.12952443957328796, "kl": 0.025446251966059208, "learning_rate": 2.3833333333333335e-06, "loss": 0.0013, "num_tokens": 1703855.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5711 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.5, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 20.5, "completions/mean_terminated_length": 20.5, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 105.77777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.0765608623623848, "kl": 0.03488257247954607, "learning_rate": 2.382777777777778e-06, "loss": 0.0017, "num_tokens": 1704189.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5712 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 105.79629629629629, "frac_reward_zero_std": 1.0, "grad_norm": 0.010129240341484547, "kl": 0.0020021115196868777, "learning_rate": 2.3822222222222222e-06, "loss": 0.0001, "num_tokens": 1704469.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5713 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 26.25, "completions/clipped_ratio": 0.0, "completions/max_length": 28.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 26.25, "completions/mean_terminated_length": 26.25, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 105.81481481481481, "frac_reward_zero_std": 1.0, "grad_norm": 0.3911004066467285, "kl": 0.12980439513921738, "learning_rate": 2.381666666666667e-06, "loss": 0.0065, "num_tokens": 1704810.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5714 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 105.83333333333333, "frac_reward_zero_std": 1.0, "grad_norm": 0.18305377662181854, "kl": 0.019910499220713973, "learning_rate": 2.3811111111111114e-06, "loss": 0.001, "num_tokens": 1705114.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5715 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.5, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.5, "completions/mean_terminated_length": 6.5, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 105.85185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.015292679890990257, "kl": 0.0009461307781748474, "learning_rate": 2.3805555555555557e-06, "loss": 0.0, "num_tokens": 1705348.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 5716 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.25, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.25, "completions/mean_terminated_length": 11.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 105.87037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.09411591291427612, "kl": 0.029015840031206608, "learning_rate": 2.38e-06, "loss": 0.0015, "num_tokens": 1705621.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5717 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.25, "completions/clipped_ratio": 0.0, "completions/max_length": 41.0, "completions/max_terminated_length": 41.0, "completions/mean_length": 22.25, "completions/mean_terminated_length": 22.25, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 105.88888888888889, "frac_reward_zero_std": 0.0, "grad_norm": 2.578662395477295, "kl": 0.022182319313287735, "learning_rate": 2.3794444444444444e-06, "loss": 0.2197, "num_tokens": 1705958.0, "reward": 2.75, "reward_std": 1.5, "rewards/reward_combined/mean": 2.75, "rewards/reward_combined/std": 1.5, "step": 5718 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 105.9074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.07210088521242142, "kl": 0.03758269175887108, "learning_rate": 2.3788888888888892e-06, "loss": 0.0019, "num_tokens": 1706250.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5719 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.75, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 19.75, "completions/mean_terminated_length": 19.75, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 105.92592592592592, "frac_reward_zero_std": 0.0, "grad_norm": 1.697019338607788, "kl": 0.1536034643650055, "learning_rate": 2.3783333333333336e-06, "loss": -0.1221, "num_tokens": 1706549.0, "reward": 7.875, "reward_std": 0.25, "rewards/reward_combined/mean": 7.875, "rewards/reward_combined/std": 0.25, "step": 5720 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.25, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.25, "completions/mean_terminated_length": 14.25, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 105.94444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.044653408229351044, "kl": 0.006373573560267687, "learning_rate": 2.377777777777778e-06, "loss": 0.0003, "num_tokens": 1706838.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5721 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.75, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 19.75, "completions/mean_terminated_length": 19.75, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 105.96296296296296, "frac_reward_zero_std": 0.0, "grad_norm": 3.171308755874634, "kl": 0.9225605428218842, "learning_rate": 2.3772222222222223e-06, "loss": 0.1144, "num_tokens": 1707165.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 5722 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 105.98148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.01601611077785492, "kl": 0.0034845881164073944, "learning_rate": 2.3766666666666666e-06, "loss": 0.0002, "num_tokens": 1707425.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 5723 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 106.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.03060140088200569, "kl": 0.003006672137416899, "learning_rate": 2.3761111111111114e-06, "loss": 0.0002, "num_tokens": 1707685.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5724 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.75, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 6.75, "completions/mean_terminated_length": 6.75, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 106.01851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.08410724252462387, "kl": 0.0057574117090553045, "learning_rate": 2.3755555555555558e-06, "loss": 0.0003, "num_tokens": 1707908.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 5725 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.5, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 19.5, "completions/mean_terminated_length": 19.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 106.03703703703704, "frac_reward_zero_std": 0.0, "grad_norm": 1.3053275346755981, "kl": 0.46230063028633595, "learning_rate": 2.375e-06, "loss": 0.1033, "num_tokens": 1708234.0, "reward": 6.75, "reward_std": 1.5, "rewards/reward_combined/mean": 6.75, "rewards/reward_combined/std": 1.5, "step": 5726 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.25, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 15.25, "completions/mean_terminated_length": 15.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 106.05555555555556, "frac_reward_zero_std": 0.0, "grad_norm": 19.848894119262695, "kl": 3.188643192872405, "learning_rate": 2.374444444444445e-06, "loss": 0.2543, "num_tokens": 1708511.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 5727 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.25, "completions/clipped_ratio": 0.0, "completions/max_length": 25.0, "completions/max_terminated_length": 25.0, "completions/mean_length": 19.25, "completions/mean_terminated_length": 19.25, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 106.07407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.03114287741482258, "kl": 0.06825514882802963, "learning_rate": 2.3738888888888893e-06, "loss": 0.0033, "num_tokens": 1708808.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 5728 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.0, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 23.0, "completions/mean_terminated_length": 23.0, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 106.0925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 1.3632651567459106, "kl": 0.1956938225775957, "learning_rate": 2.3733333333333336e-06, "loss": 0.0106, "num_tokens": 1709144.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5729 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 54.75, "completions/clipped_ratio": 0.0, "completions/max_length": 115.0, "completions/max_terminated_length": 115.0, "completions/mean_length": 54.75, "completions/mean_terminated_length": 54.75, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 106.11111111111111, "frac_reward_zero_std": 0.0, "grad_norm": 1.5344648361206055, "kl": 0.07306164316833019, "learning_rate": 2.372777777777778e-06, "loss": 0.0249, "num_tokens": 1709599.0, "reward": 5.375, "reward_std": 2.462214469909668, "rewards/reward_combined/mean": 5.375, "rewards/reward_combined/std": 2.462214469909668, "step": 5730 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 30.0, "completions/clipped_ratio": 0.0, "completions/max_length": 36.0, "completions/max_terminated_length": 36.0, "completions/mean_length": 30.0, "completions/mean_terminated_length": 30.0, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 106.12962962962963, "frac_reward_zero_std": 0.0, "grad_norm": 1.9264031648635864, "kl": 0.20277749001979828, "learning_rate": 2.3722222222222223e-06, "loss": -0.0447, "num_tokens": 1709943.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 5731 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 18.0, "completions/mean_terminated_length": 18.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 106.14814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.15245340764522552, "kl": 0.02993644494563341, "learning_rate": 2.3716666666666667e-06, "loss": 0.0015, "num_tokens": 1710235.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5732 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 106.16666666666667, "frac_reward_zero_std": 1.0, "grad_norm": 0.03028515726327896, "kl": 0.006935416386113502, "learning_rate": 2.371111111111111e-06, "loss": 0.0003, "num_tokens": 1710531.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5733 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.5, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.5, "completions/mean_terminated_length": 6.5, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 106.18518518518519, "frac_reward_zero_std": 1.0, "grad_norm": 0.023131053894758224, "kl": 0.0009133219864452258, "learning_rate": 2.370555555555556e-06, "loss": 0.0, "num_tokens": 1710765.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 5734 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.25, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 18.25, "completions/mean_terminated_length": 18.25, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 106.20370370370371, "frac_reward_zero_std": 1.0, "grad_norm": 0.024490680545568466, "kl": 0.010474671609699726, "learning_rate": 2.37e-06, "loss": 0.0005, "num_tokens": 1711066.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5735 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.5, "completions/clipped_ratio": 0.0, "completions/max_length": 35.0, "completions/max_terminated_length": 35.0, "completions/mean_length": 32.5, "completions/mean_terminated_length": 32.5, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 106.22222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.13705697655677795, "kl": 0.10301698744297028, "learning_rate": 2.369444444444445e-06, "loss": 0.0052, "num_tokens": 1711424.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5736 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 106.24074074074075, "frac_reward_zero_std": 1.0, "grad_norm": 0.004659468773752451, "kl": 0.00014275312787503935, "learning_rate": 2.3688888888888893e-06, "loss": 0.0, "num_tokens": 1711680.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 5737 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.75, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.75, "completions/mean_terminated_length": 14.75, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 106.25925925925925, "frac_reward_zero_std": 1.0, "grad_norm": 0.06929275393486023, "kl": 0.02091584401205182, "learning_rate": 2.3683333333333337e-06, "loss": 0.001, "num_tokens": 1711971.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5738 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 17.0, "completions/mean_terminated_length": 17.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 106.27777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.023525772616267204, "kl": 0.16994114220142365, "learning_rate": 2.367777777777778e-06, "loss": 0.0085, "num_tokens": 1712279.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5739 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.5, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 18.5, "completions/mean_terminated_length": 18.5, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 106.29629629629629, "frac_reward_zero_std": 1.0, "grad_norm": 0.021809855476021767, "kl": 0.0018775941571220756, "learning_rate": 2.3672222222222224e-06, "loss": 0.0001, "num_tokens": 1712593.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5740 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.5, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 20.5, "completions/mean_terminated_length": 20.5, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 106.31481481481481, "frac_reward_zero_std": 1.0, "grad_norm": 0.05740426853299141, "kl": 0.03520657867193222, "learning_rate": 2.3666666666666667e-06, "loss": 0.0018, "num_tokens": 1712927.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5741 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 33.5, "completions/clipped_ratio": 0.0, "completions/max_length": 41.0, "completions/max_terminated_length": 41.0, "completions/mean_length": 33.5, "completions/mean_terminated_length": 33.5, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 106.33333333333333, "frac_reward_zero_std": 1.0, "grad_norm": 0.05906430631875992, "kl": 0.04382525943219662, "learning_rate": 2.366111111111111e-06, "loss": 0.0022, "num_tokens": 1713285.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5742 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.0, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 24.0, "completions/mean_terminated_length": 24.0, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 106.35185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.2236565798521042, "kl": 0.12012775987386703, "learning_rate": 2.365555555555556e-06, "loss": 0.006, "num_tokens": 1713649.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 5743 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 106.37037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.009589975699782372, "kl": 0.008670933544635773, "learning_rate": 2.3650000000000002e-06, "loss": 0.0004, "num_tokens": 1713885.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 5744 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 106.38888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.025257421657443047, "kl": 0.002307252259925008, "learning_rate": 2.3644444444444446e-06, "loss": 0.0001, "num_tokens": 1714145.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5745 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.25, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 8.25, "completions/mean_terminated_length": 8.25, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 106.4074074074074, "frac_reward_zero_std": 0.0, "grad_norm": 3.1536567211151123, "kl": 0.06873945146799088, "learning_rate": 2.3638888888888894e-06, "loss": 0.3465, "num_tokens": 1714378.0, "reward": 3.875, "reward_std": 0.25, "rewards/reward_combined/mean": 3.875, "rewards/reward_combined/std": 0.25, "step": 5746 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.5, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 12.5, "completions/mean_terminated_length": 12.5, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 106.42592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.03494761884212494, "kl": 0.002915158780524507, "learning_rate": 2.3633333333333337e-06, "loss": 0.0002, "num_tokens": 1714640.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5747 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 106.44444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.009981038980185986, "kl": 0.0013902049977332354, "learning_rate": 2.362777777777778e-06, "loss": 0.0001, "num_tokens": 1714924.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5748 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.75, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.75, "completions/mean_terminated_length": 14.75, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 106.46296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.21880179643630981, "kl": 0.2604158893227577, "learning_rate": 2.3622222222222224e-06, "loss": 0.013, "num_tokens": 1715223.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5749 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.5, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 22.5, "completions/mean_terminated_length": 22.5, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 106.48148148148148, "frac_reward_zero_std": 0.0, "grad_norm": 2.8220419883728027, "kl": 0.2532660514116287, "learning_rate": 2.3616666666666668e-06, "loss": 0.0264, "num_tokens": 1715533.0, "reward": 7.875, "reward_std": 0.25, "rewards/reward_combined/mean": 7.875, "rewards/reward_combined/std": 0.25, "step": 5750 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 106.5, "frac_reward_zero_std": 1.0, "grad_norm": 0.020037278532981873, "kl": 0.00934132281690836, "learning_rate": 2.361111111111111e-06, "loss": 0.0005, "num_tokens": 1715845.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 5751 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 106.51851851851852, "frac_reward_zero_std": 0.0, "grad_norm": 3.405122995376587, "kl": 0.025613122968934476, "learning_rate": 2.3605555555555555e-06, "loss": 0.0052, "num_tokens": 1716115.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 5752 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.5, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 22.5, "completions/mean_terminated_length": 22.5, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 106.53703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.18898683786392212, "kl": 0.05479826591908932, "learning_rate": 2.3600000000000003e-06, "loss": 0.0028, "num_tokens": 1716433.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5753 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.25, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.25, "completions/mean_terminated_length": 14.25, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 106.55555555555556, "frac_reward_zero_std": 0.0, "grad_norm": 2.2743260860443115, "kl": 0.29593825340270996, "learning_rate": 2.3594444444444446e-06, "loss": -0.059, "num_tokens": 1716714.0, "reward": 7.875, "reward_std": 0.25, "rewards/reward_combined/mean": 7.875, "rewards/reward_combined/std": 0.25, "step": 5754 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 37.25, "completions/clipped_ratio": 0.0, "completions/max_length": 53.0, "completions/max_terminated_length": 53.0, "completions/mean_length": 37.25, "completions/mean_terminated_length": 37.25, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 106.57407407407408, "frac_reward_zero_std": 0.0, "grad_norm": 2.4411561489105225, "kl": 0.09531471878290176, "learning_rate": 2.358888888888889e-06, "loss": 0.0226, "num_tokens": 1717083.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 5755 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.5, "completions/clipped_ratio": 0.0, "completions/max_length": 25.0, "completions/max_terminated_length": 25.0, "completions/mean_length": 20.5, "completions/mean_terminated_length": 20.5, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 106.5925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.17575040459632874, "kl": 0.053868941962718964, "learning_rate": 2.3583333333333338e-06, "loss": 0.0028, "num_tokens": 1717417.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5756 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 106.61111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.11989647895097733, "kl": 0.012817461043596268, "learning_rate": 2.357777777777778e-06, "loss": 0.0006, "num_tokens": 1717677.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 5757 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 106.62962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.00609621312469244, "kl": 0.0005812495946884155, "learning_rate": 2.3572222222222225e-06, "loss": 0.0, "num_tokens": 1717889.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 5758 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 106.64814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.034360215067863464, "kl": 0.006881481967866421, "learning_rate": 2.356666666666667e-06, "loss": 0.0003, "num_tokens": 1718149.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5759 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 106.66666666666667, "frac_reward_zero_std": 1.0, "grad_norm": 0.08644485473632812, "kl": 0.006850443780422211, "learning_rate": 2.356111111111111e-06, "loss": 0.0003, "num_tokens": 1718369.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 5760 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.75, "completions/clipped_ratio": 0.0, "completions/max_length": 30.0, "completions/max_terminated_length": 30.0, "completions/mean_length": 24.75, "completions/mean_terminated_length": 24.75, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 106.68518518518519, "frac_reward_zero_std": 1.0, "grad_norm": 0.05234919860959053, "kl": 0.059454167261719704, "learning_rate": 2.3555555555555555e-06, "loss": 0.003, "num_tokens": 1718696.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5761 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.25, "completions/clipped_ratio": 0.0, "completions/max_length": 40.0, "completions/max_terminated_length": 40.0, "completions/mean_length": 18.25, "completions/mean_terminated_length": 18.25, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 106.70370370370371, "frac_reward_zero_std": 0.0, "grad_norm": 6.460033416748047, "kl": 0.009891678346320987, "learning_rate": 2.355e-06, "loss": 0.3039, "num_tokens": 1718989.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 5762 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.25, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 15.25, "completions/mean_terminated_length": 15.25, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 106.72222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.7461588382720947, "kl": 0.11997375264763832, "learning_rate": 2.3544444444444447e-06, "loss": 0.0066, "num_tokens": 1719274.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5763 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.25, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 4.25, "completions/mean_terminated_length": 4.25, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 106.74074074074075, "frac_reward_zero_std": 1.0, "grad_norm": 0.02506357431411743, "kl": 0.0005575179820880294, "learning_rate": 2.353888888888889e-06, "loss": 0.0, "num_tokens": 1719487.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 5764 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 106.75925925925925, "frac_reward_zero_std": 1.0, "grad_norm": 0.06410881876945496, "kl": 0.012203714810311794, "learning_rate": 2.3533333333333334e-06, "loss": 0.0006, "num_tokens": 1719779.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5765 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.25, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 17.25, "completions/mean_terminated_length": 17.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 106.77777777777777, "frac_reward_zero_std": 0.0, "grad_norm": 2.421283721923828, "kl": 0.061153936199843884, "learning_rate": 2.352777777777778e-06, "loss": 0.021, "num_tokens": 1720088.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 5766 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 43.5, "completions/clipped_ratio": 0.0, "completions/max_length": 49.0, "completions/max_terminated_length": 49.0, "completions/mean_length": 43.5, "completions/mean_terminated_length": 43.5, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 106.79629629629629, "frac_reward_zero_std": 0.0, "grad_norm": 6.159648895263672, "kl": 1.5203693620860577, "learning_rate": 2.3522222222222225e-06, "loss": 0.0152, "num_tokens": 1720542.0, "reward": 1.75, "reward_std": 1.443375587463379, "rewards/reward_combined/mean": 1.75, "rewards/reward_combined/std": 1.4433757066726685, "step": 5767 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.25, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 9.25, "completions/mean_terminated_length": 9.25, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 106.81481481481481, "frac_reward_zero_std": 0.0, "grad_norm": 5.124983787536621, "kl": 0.04033631086349487, "learning_rate": 2.351666666666667e-06, "loss": 0.0572, "num_tokens": 1720791.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 5768 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 7.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 7.0, "completions/mean_terminated_length": 7.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 106.83333333333333, "frac_reward_zero_std": 1.0, "grad_norm": 0.00500383460894227, "kl": 0.0005019232630729675, "learning_rate": 2.3511111111111112e-06, "loss": 0.0, "num_tokens": 1721039.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 5769 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.0, "completions/clipped_ratio": 0.0, "completions/max_length": 31.0, "completions/max_terminated_length": 31.0, "completions/mean_length": 24.0, "completions/mean_terminated_length": 24.0, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 106.85185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.06995708495378494, "kl": 0.029155928641557693, "learning_rate": 2.3505555555555556e-06, "loss": 0.0016, "num_tokens": 1721383.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 5770 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.75, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 13.75, "completions/mean_terminated_length": 13.75, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 106.87037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.13819153606891632, "kl": 0.038990022614598274, "learning_rate": 2.35e-06, "loss": 0.002, "num_tokens": 1721670.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5771 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 5.5, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 5.5, "completions/mean_terminated_length": 5.5, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 106.88888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.05095929279923439, "kl": 0.002921677427366376, "learning_rate": 2.3494444444444447e-06, "loss": 0.0002, "num_tokens": 1721892.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 5772 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 106.9074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.049554381519556046, "kl": 0.026146933436393738, "learning_rate": 2.348888888888889e-06, "loss": 0.0014, "num_tokens": 1722162.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5773 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 106.92592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.026631832122802734, "kl": 0.0025556276086717844, "learning_rate": 2.3483333333333334e-06, "loss": 0.0001, "num_tokens": 1722486.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5774 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 106.94444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.0038923821412026882, "kl": 0.0018814844661392272, "learning_rate": 2.347777777777778e-06, "loss": 0.0001, "num_tokens": 1722766.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5775 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 31.0, "completions/clipped_ratio": 0.0, "completions/max_length": 37.0, "completions/max_terminated_length": 37.0, "completions/mean_length": 31.0, "completions/mean_terminated_length": 31.0, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 106.96296296296296, "frac_reward_zero_std": 0.0, "grad_norm": 1.5978940725326538, "kl": 0.5755023658275604, "learning_rate": 2.3472222222222226e-06, "loss": 0.0464, "num_tokens": 1723106.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 5776 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.25, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.25, "completions/mean_terminated_length": 11.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 106.98148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.02107800357043743, "kl": 0.009917078074067831, "learning_rate": 2.346666666666667e-06, "loss": 0.0005, "num_tokens": 1723379.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5777 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 10.5, "completions/mean_terminated_length": 10.5, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 107.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.029854748398065567, "kl": 0.0046214081812649965, "learning_rate": 2.3461111111111113e-06, "loss": 0.0002, "num_tokens": 1723645.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5778 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.75, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.75, "completions/mean_terminated_length": 14.75, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 107.01851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.0947212353348732, "kl": 0.049269286915659904, "learning_rate": 2.3455555555555556e-06, "loss": 0.0024, "num_tokens": 1723936.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5779 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 18.0, "completions/mean_terminated_length": 18.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 107.03703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.02916792221367359, "kl": 0.005674648564308882, "learning_rate": 2.345e-06, "loss": 0.0003, "num_tokens": 1724248.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5780 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.25, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 22.25, "completions/mean_terminated_length": 22.25, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 107.05555555555556, "frac_reward_zero_std": 0.0, "grad_norm": 2.673243761062622, "kl": 0.04141937755048275, "learning_rate": 2.3444444444444448e-06, "loss": 0.0131, "num_tokens": 1724565.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 5781 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 107.07407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.037117380648851395, "kl": 0.24277297407388687, "learning_rate": 2.343888888888889e-06, "loss": 0.0121, "num_tokens": 1724865.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5782 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 90.0, "completions/clipped_ratio": 0.25, "completions/max_length": 256.0, "completions/max_terminated_length": 52.0, "completions/mean_length": 90.0, "completions/mean_terminated_length": 34.66666793823242, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 107.0925925925926, "frac_reward_zero_std": 0.0, "grad_norm": 1.6891608238220215, "kl": 0.16050830483436584, "learning_rate": 2.3433333333333335e-06, "loss": 0.5359, "num_tokens": 1725445.0, "reward": 3.799999952316284, "reward_std": 4.982635974884033, "rewards/reward_combined/mean": 3.799999952316284, "rewards/reward_combined/std": 4.982636451721191, "step": 5783 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 107.11111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.003059547394514084, "kl": 0.00014421343803405762, "learning_rate": 2.342777777777778e-06, "loss": 0.0, "num_tokens": 1725657.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 5784 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 29.0, "completions/max_terminated_length": 29.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 107.12962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.1836860477924347, "kl": 0.03857291303575039, "learning_rate": 2.342222222222222e-06, "loss": 0.0025, "num_tokens": 1725923.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 5785 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.5, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 18.5, "completions/mean_terminated_length": 18.5, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 107.14814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.04004674032330513, "kl": 0.02117378916591406, "learning_rate": 2.341666666666667e-06, "loss": 0.0011, "num_tokens": 1726217.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5786 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 29.25, "completions/clipped_ratio": 0.0, "completions/max_length": 45.0, "completions/max_terminated_length": 45.0, "completions/mean_length": 29.25, "completions/mean_terminated_length": 29.25, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 107.16666666666667, "frac_reward_zero_std": 0.0, "grad_norm": 1.2426217794418335, "kl": 0.0735059529542923, "learning_rate": 2.3411111111111113e-06, "loss": -0.1467, "num_tokens": 1726602.0, "reward": 4.625, "reward_std": 2.25, "rewards/reward_combined/mean": 4.625, "rewards/reward_combined/std": 2.25, "step": 5787 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 107.18518518518519, "frac_reward_zero_std": 1.0, "grad_norm": 0.03812981769442558, "kl": 0.0040266482392326, "learning_rate": 2.3405555555555557e-06, "loss": 0.0002, "num_tokens": 1726862.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5788 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.75, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 13.75, "completions/mean_terminated_length": 13.75, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 107.20370370370371, "frac_reward_zero_std": 1.0, "grad_norm": 0.024881290271878242, "kl": 0.0016295582172460854, "learning_rate": 2.3400000000000005e-06, "loss": 0.0001, "num_tokens": 1727129.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5789 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.25, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 15.25, "completions/mean_terminated_length": 15.25, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 107.22222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.03902464732527733, "kl": 0.020565229235216975, "learning_rate": 2.339444444444445e-06, "loss": 0.0012, "num_tokens": 1727430.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5790 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 107.24074074074075, "frac_reward_zero_std": 1.0, "grad_norm": 0.04755730181932449, "kl": 0.007827440742403269, "learning_rate": 2.338888888888889e-06, "loss": 0.0004, "num_tokens": 1727720.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5791 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.75, "completions/clipped_ratio": 0.0, "completions/max_length": 30.0, "completions/max_terminated_length": 30.0, "completions/mean_length": 19.75, "completions/mean_terminated_length": 19.75, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 107.25925925925925, "frac_reward_zero_std": 1.0, "grad_norm": 0.11506816744804382, "kl": 0.058151913806796074, "learning_rate": 2.3383333333333335e-06, "loss": 0.0026, "num_tokens": 1728019.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5792 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 34.0, "completions/clipped_ratio": 0.0, "completions/max_length": 48.0, "completions/max_terminated_length": 48.0, "completions/mean_length": 34.0, "completions/mean_terminated_length": 34.0, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 107.27777777777777, "frac_reward_zero_std": 0.0, "grad_norm": 1.5549845695495605, "kl": 0.3328571319580078, "learning_rate": 2.337777777777778e-06, "loss": -0.1304, "num_tokens": 1728375.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 5793 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 107.29629629629629, "frac_reward_zero_std": 1.0, "grad_norm": 0.03803160786628723, "kl": 0.004186135716736317, "learning_rate": 2.3372222222222222e-06, "loss": 0.0002, "num_tokens": 1728699.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5794 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 17.0, "completions/mean_terminated_length": 17.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 107.31481481481481, "frac_reward_zero_std": 0.0, "grad_norm": 5.095707416534424, "kl": 0.20344921946525574, "learning_rate": 2.3366666666666666e-06, "loss": 0.0084, "num_tokens": 1729007.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 5795 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 48.0, "completions/clipped_ratio": 0.0, "completions/max_length": 55.0, "completions/max_terminated_length": 55.0, "completions/mean_length": 48.0, "completions/mean_terminated_length": 48.0, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 107.33333333333333, "frac_reward_zero_std": 1.0, "grad_norm": 0.032316964119672775, "kl": 0.04861418157815933, "learning_rate": 2.3361111111111114e-06, "loss": 0.0024, "num_tokens": 1729479.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 5796 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.25, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.25, "completions/mean_terminated_length": 14.25, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 107.35185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.09154286980628967, "kl": 0.021268797107040882, "learning_rate": 2.3355555555555557e-06, "loss": 0.0011, "num_tokens": 1729768.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5797 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 79.75, "completions/clipped_ratio": 0.25, "completions/max_length": 256.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 79.75, "completions/mean_terminated_length": 21.0, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 107.37037037037037, "frac_reward_zero_std": 0.0, "grad_norm": 1.8132514953613281, "kl": 0.05866039916872978, "learning_rate": 2.3350000000000005e-06, "loss": 0.4252, "num_tokens": 1730339.0, "reward": 5.300000190734863, "reward_std": 5.399999618530273, "rewards/reward_combined/mean": 5.300000190734863, "rewards/reward_combined/std": 5.40000057220459, "step": 5798 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 35.0, "completions/clipped_ratio": 0.0, "completions/max_length": 39.0, "completions/max_terminated_length": 39.0, "completions/mean_length": 35.0, "completions/mean_terminated_length": 35.0, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 107.38888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.16277068853378296, "kl": 0.08349389210343361, "learning_rate": 2.334444444444445e-06, "loss": 0.0042, "num_tokens": 1730707.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5799 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.0, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 19.0, "completions/mean_terminated_length": 19.0, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 107.4074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.054548054933547974, "kl": 0.023101845756173134, "learning_rate": 2.3338888888888892e-06, "loss": 0.0012, "num_tokens": 1731035.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5800 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.25, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 13.25, "completions/mean_terminated_length": 13.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 107.42592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.05309860035777092, "kl": 0.061673738062381744, "learning_rate": 2.3333333333333336e-06, "loss": 0.0031, "num_tokens": 1731312.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5801 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 107.44444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.019904647022485733, "kl": 0.029820837080478668, "learning_rate": 2.332777777777778e-06, "loss": 0.0015, "num_tokens": 1731528.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 5802 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 107.46296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.016115298494696617, "kl": 0.012816570699214935, "learning_rate": 2.3322222222222223e-06, "loss": 0.0006, "num_tokens": 1731840.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 5803 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 27.0, "completions/clipped_ratio": 0.0, "completions/max_length": 31.0, "completions/max_terminated_length": 31.0, "completions/mean_length": 27.0, "completions/mean_terminated_length": 27.0, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 107.48148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.2066686451435089, "kl": 0.07312379777431488, "learning_rate": 2.3316666666666666e-06, "loss": 0.0035, "num_tokens": 1732176.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5804 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 107.5, "frac_reward_zero_std": 1.0, "grad_norm": 0.008337294682860374, "kl": 0.00046882529568392783, "learning_rate": 2.331111111111111e-06, "loss": 0.0, "num_tokens": 1732392.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 5805 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 5.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 5.0, "completions/mean_terminated_length": 5.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 107.51851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.11589283496141434, "kl": 0.01985331401374424, "learning_rate": 2.3305555555555558e-06, "loss": 0.001, "num_tokens": 1732612.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 5806 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 107.53703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.07790113985538483, "kl": 0.00991291319951415, "learning_rate": 2.33e-06, "loss": 0.0005, "num_tokens": 1732904.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5807 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.75, "completions/clipped_ratio": 0.0, "completions/max_length": 38.0, "completions/max_terminated_length": 38.0, "completions/mean_length": 32.75, "completions/mean_terminated_length": 32.75, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 107.55555555555556, "frac_reward_zero_std": 0.0, "grad_norm": 2.2127726078033447, "kl": 0.1364831179380417, "learning_rate": 2.329444444444445e-06, "loss": -0.0106, "num_tokens": 1733259.0, "reward": 5.75, "reward_std": 2.598076105117798, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 2.598076105117798, "step": 5808 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 28.75, "completions/clipped_ratio": 0.0, "completions/max_length": 34.0, "completions/max_terminated_length": 34.0, "completions/mean_length": 28.75, "completions/mean_terminated_length": 28.75, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 107.57407407407408, "frac_reward_zero_std": 0.0, "grad_norm": 3.909634590148926, "kl": 1.25171709805727, "learning_rate": 2.3288888888888893e-06, "loss": 0.0705, "num_tokens": 1733610.0, "reward": 4.625, "reward_std": 3.4731109142303467, "rewards/reward_combined/mean": 4.625, "rewards/reward_combined/std": 3.4731109142303467, "step": 5809 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 10.5, "completions/mean_terminated_length": 10.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 107.5925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.044010888785123825, "kl": 0.001714371086563915, "learning_rate": 2.3283333333333336e-06, "loss": 0.0001, "num_tokens": 1733876.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5810 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.25, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 16.25, "completions/mean_terminated_length": 16.25, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 107.61111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.08355581015348434, "kl": 0.020125368610024452, "learning_rate": 2.327777777777778e-06, "loss": 0.0011, "num_tokens": 1734165.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5811 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.0, "completions/clipped_ratio": 0.0, "completions/max_length": 28.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 20.0, "completions/mean_terminated_length": 20.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 107.62962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.42019233107566833, "kl": 0.09688074514269829, "learning_rate": 2.3272222222222223e-06, "loss": 0.0051, "num_tokens": 1734465.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 5812 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 107.64814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.021346475929021835, "kl": 0.0038149358006194234, "learning_rate": 2.3266666666666667e-06, "loss": 0.0002, "num_tokens": 1734745.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5813 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 107.66666666666667, "frac_reward_zero_std": 1.0, "grad_norm": 0.003642988856881857, "kl": 0.0032901540398597717, "learning_rate": 2.326111111111111e-06, "loss": 0.0002, "num_tokens": 1735005.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 5814 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 107.68518518518519, "frac_reward_zero_std": 1.0, "grad_norm": 0.02244674786925316, "kl": 0.004156402312219143, "learning_rate": 2.325555555555556e-06, "loss": 0.0002, "num_tokens": 1735265.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5815 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 26.75, "completions/clipped_ratio": 0.0, "completions/max_length": 37.0, "completions/max_terminated_length": 37.0, "completions/mean_length": 26.75, "completions/mean_terminated_length": 26.75, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 107.70370370370371, "frac_reward_zero_std": 0.0, "grad_norm": 2.2421159744262695, "kl": 0.24025265872478485, "learning_rate": 2.325e-06, "loss": -0.1089, "num_tokens": 1735588.0, "reward": 6.125, "reward_std": 3.75, "rewards/reward_combined/mean": 6.125, "rewards/reward_combined/std": 3.75, "step": 5816 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 9.5, "completions/mean_terminated_length": 9.5, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 107.72222222222223, "frac_reward_zero_std": 0.0, "grad_norm": 1.2355631589889526, "kl": 0.34938568493817, "learning_rate": 2.3244444444444445e-06, "loss": -0.0375, "num_tokens": 1735846.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 5817 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.25, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 19.25, "completions/mean_terminated_length": 19.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 107.74074074074075, "frac_reward_zero_std": 1.0, "grad_norm": 0.054172638803720474, "kl": 0.025564534589648247, "learning_rate": 2.3238888888888893e-06, "loss": 0.0013, "num_tokens": 1736171.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5818 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.25, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.25, "completions/mean_terminated_length": 6.25, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 107.75925925925925, "frac_reward_zero_std": 1.0, "grad_norm": 0.03957649692893028, "kl": 0.0022131119621917605, "learning_rate": 2.3233333333333337e-06, "loss": 0.0001, "num_tokens": 1736404.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 5819 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.75, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 3.75, "completions/mean_terminated_length": 3.75, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 107.77777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.03478916734457016, "kl": 0.0024515882250852883, "learning_rate": 2.322777777777778e-06, "loss": 0.0001, "num_tokens": 1736615.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 5820 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 12.5, "completions/mean_terminated_length": 12.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 107.79629629629629, "frac_reward_zero_std": 1.0, "grad_norm": 0.011827087961137295, "kl": 0.0011846072738990188, "learning_rate": 2.3222222222222224e-06, "loss": 0.0001, "num_tokens": 1736897.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5821 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 107.81481481481481, "frac_reward_zero_std": 1.0, "grad_norm": 0.009398079477250576, "kl": 0.008807599544525146, "learning_rate": 2.3216666666666667e-06, "loss": 0.0004, "num_tokens": 1737133.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 5822 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 107.83333333333333, "frac_reward_zero_std": 1.0, "grad_norm": 0.016432318836450577, "kl": 0.0012397616519592702, "learning_rate": 2.321111111111111e-06, "loss": 0.0001, "num_tokens": 1737393.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 5823 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 107.85185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.08510641008615494, "kl": 0.04104503057897091, "learning_rate": 2.3205555555555555e-06, "loss": 0.0021, "num_tokens": 1737665.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5824 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 107.87037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.023451462388038635, "kl": 0.009075218811631203, "learning_rate": 2.3200000000000002e-06, "loss": 0.0005, "num_tokens": 1737937.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5825 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 7.75, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 7.75, "completions/mean_terminated_length": 7.75, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 107.88888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.12314886599779129, "kl": 0.016502720303833485, "learning_rate": 2.3194444444444446e-06, "loss": 0.0013, "num_tokens": 1738172.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 5826 }, { "clip_ratio/high_max": 0.012500000186264515, "clip_ratio/high_mean": 0.012500000186264515, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.012500000186264515, "completion_length": 21.5, "completions/clipped_ratio": 0.0, "completions/max_length": 28.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 21.5, "completions/mean_terminated_length": 21.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 107.9074074074074, "frac_reward_zero_std": 0.0, "grad_norm": 3.180518865585327, "kl": 0.1273084282875061, "learning_rate": 2.318888888888889e-06, "loss": -0.0691, "num_tokens": 1738506.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 5827 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 107.92592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.05052276328206062, "kl": 0.011836867081001401, "learning_rate": 2.3183333333333337e-06, "loss": 0.0006, "num_tokens": 1738804.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5828 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 38.0, "completions/clipped_ratio": 0.0, "completions/max_length": 59.0, "completions/max_terminated_length": 59.0, "completions/mean_length": 38.0, "completions/mean_terminated_length": 38.0, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 107.94444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.8808985948562622, "kl": 0.1565047912299633, "learning_rate": 2.317777777777778e-06, "loss": 0.008, "num_tokens": 1739180.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5829 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.5, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 21.5, "completions/mean_terminated_length": 21.5, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 107.96296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.0691770613193512, "kl": 0.012700721621513367, "learning_rate": 2.3172222222222224e-06, "loss": 0.0006, "num_tokens": 1739510.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5830 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 107.98148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 3.488523006439209, "kl": 0.7153765112161636, "learning_rate": 2.316666666666667e-06, "loss": 0.0358, "num_tokens": 1739798.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5831 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 108.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.01480660866945982, "kl": 0.00035792289418168366, "learning_rate": 2.316111111111111e-06, "loss": 0.0, "num_tokens": 1740054.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 5832 }, { "clip_ratio/high_max": 0.01666666753590107, "clip_ratio/high_mean": 0.01666666753590107, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.01666666753590107, "completion_length": 16.25, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 16.25, "completions/mean_terminated_length": 16.25, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 108.01851851851852, "frac_reward_zero_std": 0.0, "grad_norm": 4.842832565307617, "kl": 0.08850051835179329, "learning_rate": 2.3155555555555555e-06, "loss": 0.0342, "num_tokens": 1740343.0, "reward": 4.0, "reward_std": 4.690415859222412, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 4.690415859222412, "step": 5833 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 108.03703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.09923389554023743, "kl": 0.03482908569276333, "learning_rate": 2.3150000000000003e-06, "loss": 0.0019, "num_tokens": 1740623.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5834 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.5, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 19.5, "completions/mean_terminated_length": 19.5, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 108.05555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 4.542205333709717, "kl": 0.7719307839870453, "learning_rate": 2.3144444444444446e-06, "loss": 0.0354, "num_tokens": 1740921.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5835 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.0, "completions/clipped_ratio": 0.0, "completions/max_length": 37.0, "completions/max_terminated_length": 37.0, "completions/mean_length": 23.0, "completions/mean_terminated_length": 23.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 108.07407407407408, "frac_reward_zero_std": 0.0, "grad_norm": 3.812767744064331, "kl": 0.2872639670968056, "learning_rate": 2.313888888888889e-06, "loss": -0.0019, "num_tokens": 1741229.0, "reward": 4.375, "reward_std": 4.190763473510742, "rewards/reward_combined/mean": 4.375, "rewards/reward_combined/std": 4.190763473510742, "step": 5836 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 108.0925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.01992671750485897, "kl": 0.02975419908761978, "learning_rate": 2.3133333333333333e-06, "loss": 0.0015, "num_tokens": 1741445.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 5837 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.75, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 20.75, "completions/mean_terminated_length": 20.75, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 108.11111111111111, "frac_reward_zero_std": 0.0, "grad_norm": 3.6070327758789062, "kl": 0.040311574935913086, "learning_rate": 2.312777777777778e-06, "loss": 0.0001, "num_tokens": 1741780.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 5838 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.25, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.25, "completions/mean_terminated_length": 13.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 108.12962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.017747828736901283, "kl": 0.003142915549688041, "learning_rate": 2.3122222222222225e-06, "loss": 0.0001, "num_tokens": 1742057.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5839 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 29.25, "completions/clipped_ratio": 0.0, "completions/max_length": 45.0, "completions/max_terminated_length": 45.0, "completions/mean_length": 29.25, "completions/mean_terminated_length": 29.25, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 108.14814814814815, "frac_reward_zero_std": 0.0, "grad_norm": 0.9838048219680786, "kl": 0.15467847138643265, "learning_rate": 2.311666666666667e-06, "loss": -0.1756, "num_tokens": 1742442.0, "reward": 4.0, "reward_std": 2.915475845336914, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 2.915475845336914, "step": 5840 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.25, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.25, "completions/mean_terminated_length": 14.25, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 108.16666666666667, "frac_reward_zero_std": 1.0, "grad_norm": 0.021793050691485405, "kl": 0.002154845278710127, "learning_rate": 2.311111111111111e-06, "loss": 0.0001, "num_tokens": 1742759.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5841 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 108.18518518518519, "frac_reward_zero_std": 1.0, "grad_norm": 0.074839748442173, "kl": 0.015805783681571484, "learning_rate": 2.3105555555555556e-06, "loss": 0.0008, "num_tokens": 1743029.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5842 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.25, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 18.25, "completions/mean_terminated_length": 18.25, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 108.20370370370371, "frac_reward_zero_std": 1.0, "grad_norm": 0.03957245498895645, "kl": 0.07279252260923386, "learning_rate": 2.3100000000000003e-06, "loss": 0.0036, "num_tokens": 1743322.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 5843 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.75, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 17.75, "completions/mean_terminated_length": 17.75, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 108.22222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.05490855127573013, "kl": 0.008196674869395792, "learning_rate": 2.3094444444444447e-06, "loss": 0.0004, "num_tokens": 1743633.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5844 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 108.24074074074075, "frac_reward_zero_std": 1.0, "grad_norm": 0.006698533892631531, "kl": 0.0007755756378173828, "learning_rate": 2.308888888888889e-06, "loss": 0.0, "num_tokens": 1743845.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 5845 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 108.25925925925925, "frac_reward_zero_std": 0.0, "grad_norm": 3.1662638187408447, "kl": 0.2746245265007019, "learning_rate": 2.3083333333333334e-06, "loss": 0.0139, "num_tokens": 1744129.0, "reward": 7.875, "reward_std": 0.25, "rewards/reward_combined/mean": 7.875, "rewards/reward_combined/std": 0.25, "step": 5846 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 108.27777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.018880562856793404, "kl": 0.006124564446508884, "learning_rate": 2.3077777777777778e-06, "loss": 0.0003, "num_tokens": 1744441.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 5847 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.25, "completions/clipped_ratio": 0.0, "completions/max_length": 42.0, "completions/max_terminated_length": 42.0, "completions/mean_length": 32.25, "completions/mean_terminated_length": 32.25, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 108.29629629629629, "frac_reward_zero_std": 0.0, "grad_norm": 2.2042155265808105, "kl": 0.11802228167653084, "learning_rate": 2.3072222222222225e-06, "loss": 0.1454, "num_tokens": 1744806.0, "reward": 5.25, "reward_std": 2.598076105117798, "rewards/reward_combined/mean": 5.25, "rewards/reward_combined/std": 2.598076105117798, "step": 5848 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 108.31481481481481, "frac_reward_zero_std": 1.0, "grad_norm": 0.03350745514035225, "kl": 0.019757229834794998, "learning_rate": 2.306666666666667e-06, "loss": 0.001, "num_tokens": 1745094.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5849 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.0, "completions/clipped_ratio": 0.0, "completions/max_length": 27.0, "completions/max_terminated_length": 27.0, "completions/mean_length": 23.0, "completions/mean_terminated_length": 23.0, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 108.33333333333333, "frac_reward_zero_std": 1.0, "grad_norm": 0.04460775479674339, "kl": 0.05793546140193939, "learning_rate": 2.3061111111111112e-06, "loss": 0.0029, "num_tokens": 1745414.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5850 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 108.35185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.01727634109556675, "kl": 0.0015140151954255998, "learning_rate": 2.305555555555556e-06, "loss": 0.0001, "num_tokens": 1745688.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5851 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.75, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 10.75, "completions/mean_terminated_length": 10.75, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 108.37037037037037, "frac_reward_zero_std": 0.0, "grad_norm": 7.871547698974609, "kl": 0.03024229221045971, "learning_rate": 2.3050000000000004e-06, "loss": 0.1377, "num_tokens": 1745955.0, "reward": 5.875, "reward_std": 3.25, "rewards/reward_combined/mean": 5.875, "rewards/reward_combined/std": 3.25, "step": 5852 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 108.38888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.0002206312055932358, "kl": 1.2286007404327393e-05, "learning_rate": 2.3044444444444447e-06, "loss": 0.0, "num_tokens": 1746175.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 5853 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 108.4074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.014493185095489025, "kl": 0.04444310744293034, "learning_rate": 2.303888888888889e-06, "loss": 0.0022, "num_tokens": 1746451.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5854 }, { "clip_ratio/high_max": 0.017543859779834747, "clip_ratio/high_mean": 0.017543859779834747, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.017543859779834747, "completion_length": 30.0, "completions/clipped_ratio": 0.0, "completions/max_length": 33.0, "completions/max_terminated_length": 33.0, "completions/mean_length": 30.0, "completions/mean_terminated_length": 30.0, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 108.42592592592592, "frac_reward_zero_std": 0.0, "grad_norm": 2.592961311340332, "kl": 0.1612509936094284, "learning_rate": 2.3033333333333334e-06, "loss": 0.0322, "num_tokens": 1746795.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 5855 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.25, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.25, "completions/mean_terminated_length": 14.25, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 108.44444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.04418332129716873, "kl": 0.004384535131976008, "learning_rate": 2.302777777777778e-06, "loss": 0.0002, "num_tokens": 1747084.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5856 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.0, "completions/clipped_ratio": 0.0, "completions/max_length": 31.0, "completions/max_terminated_length": 31.0, "completions/mean_length": 24.0, "completions/mean_terminated_length": 24.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 108.46296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.40577051043510437, "kl": 0.06118386052548885, "learning_rate": 2.302222222222222e-06, "loss": 0.0034, "num_tokens": 1747428.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 5857 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.25, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.25, "completions/mean_terminated_length": 6.25, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 108.48148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.040121667087078094, "kl": 0.00228995643556118, "learning_rate": 2.301666666666667e-06, "loss": 0.0001, "num_tokens": 1747661.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 5858 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.5, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 16.5, "completions/mean_terminated_length": 16.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 108.5, "frac_reward_zero_std": 1.0, "grad_norm": 0.05861106887459755, "kl": 0.012743785046041012, "learning_rate": 2.3011111111111113e-06, "loss": 0.0006, "num_tokens": 1747967.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5859 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 108.51851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.01555357500910759, "kl": 0.003490954637527466, "learning_rate": 2.300555555555556e-06, "loss": 0.0002, "num_tokens": 1748227.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 5860 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 108.53703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.009012174792587757, "kl": 0.001056483160937205, "learning_rate": 2.3000000000000004e-06, "loss": 0.0001, "num_tokens": 1748511.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5861 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 108.55555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.022230396047234535, "kl": 0.001808389090001583, "learning_rate": 2.2994444444444448e-06, "loss": 0.0001, "num_tokens": 1748779.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5862 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.5, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 4.5, "completions/mean_terminated_length": 4.5, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 108.57407407407408, "frac_reward_zero_std": 0.0, "grad_norm": 10.614951133728027, "kl": 0.11329230666160583, "learning_rate": 2.298888888888889e-06, "loss": 0.1674, "num_tokens": 1748993.0, "reward": 2.75, "reward_std": 1.5, "rewards/reward_combined/mean": 2.75, "rewards/reward_combined/std": 1.5, "step": 5863 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 28.0, "completions/clipped_ratio": 0.0, "completions/max_length": 53.0, "completions/max_terminated_length": 53.0, "completions/mean_length": 28.0, "completions/mean_terminated_length": 28.0, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 108.5925925925926, "frac_reward_zero_std": 0.0, "grad_norm": 1.7755922079086304, "kl": 0.09784675016999245, "learning_rate": 2.2983333333333335e-06, "loss": -0.2234, "num_tokens": 1749325.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 5864 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.75, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 16.75, "completions/mean_terminated_length": 16.75, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 108.61111111111111, "frac_reward_zero_std": 0.0, "grad_norm": 3.3857340812683105, "kl": 0.5362478792667389, "learning_rate": 2.297777777777778e-06, "loss": -0.0386, "num_tokens": 1749632.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 5865 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 45.0, "completions/clipped_ratio": 0.0, "completions/max_length": 51.0, "completions/max_terminated_length": 51.0, "completions/mean_length": 45.0, "completions/mean_terminated_length": 45.0, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 108.62962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.12239616364240646, "kl": 0.05591881461441517, "learning_rate": 2.297222222222222e-06, "loss": 0.0028, "num_tokens": 1750092.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 5866 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.25, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 4.25, "completions/mean_terminated_length": 4.25, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 108.64814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.03832441195845604, "kl": 0.0008699357495061122, "learning_rate": 2.2966666666666666e-06, "loss": 0.0, "num_tokens": 1750305.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 5867 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.25, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 10.25, "completions/mean_terminated_length": 10.25, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 108.66666666666667, "frac_reward_zero_std": 1.0, "grad_norm": 0.028372397646307945, "kl": 0.0008040661341510713, "learning_rate": 2.2961111111111113e-06, "loss": 0.0, "num_tokens": 1750562.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 5868 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.75, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 17.75, "completions/mean_terminated_length": 17.75, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 108.68518518518519, "frac_reward_zero_std": 1.0, "grad_norm": 0.12573294341564178, "kl": 0.018557171104475856, "learning_rate": 2.2955555555555557e-06, "loss": 0.0009, "num_tokens": 1750861.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5869 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 27.0, "completions/clipped_ratio": 0.0, "completions/max_length": 36.0, "completions/max_terminated_length": 36.0, "completions/mean_length": 27.0, "completions/mean_terminated_length": 27.0, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 108.70370370370371, "frac_reward_zero_std": 1.0, "grad_norm": 0.297116756439209, "kl": 0.0736316591501236, "learning_rate": 2.2950000000000005e-06, "loss": 0.0038, "num_tokens": 1751221.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5870 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 10.5, "completions/mean_terminated_length": 10.5, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 108.72222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.027007507160305977, "kl": 0.004805374541319907, "learning_rate": 2.294444444444445e-06, "loss": 0.0002, "num_tokens": 1751487.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5871 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 108.74074074074075, "frac_reward_zero_std": 0.0, "grad_norm": 9.09406566619873, "kl": 0.06829983182251453, "learning_rate": 2.293888888888889e-06, "loss": 0.049, "num_tokens": 1751761.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 5872 }, { "clip_ratio/high_max": 0.004999999888241291, "clip_ratio/high_mean": 0.004999999888241291, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004999999888241291, "completion_length": 42.5, "completions/clipped_ratio": 0.0, "completions/max_length": 53.0, "completions/max_terminated_length": 53.0, "completions/mean_length": 42.5, "completions/mean_terminated_length": 42.5, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 108.75925925925925, "frac_reward_zero_std": 0.0, "grad_norm": 2.57108736038208, "kl": 0.07682021521031857, "learning_rate": 2.2933333333333335e-06, "loss": 0.037, "num_tokens": 1752155.0, "reward": 5.25, "reward_std": 4.5, "rewards/reward_combined/mean": 5.25, "rewards/reward_combined/std": 4.5, "step": 5873 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.25, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 13.25, "completions/mean_terminated_length": 13.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 108.77777777777777, "frac_reward_zero_std": 0.0, "grad_norm": 5.11784553527832, "kl": 0.06591392681002617, "learning_rate": 2.292777777777778e-06, "loss": 0.0873, "num_tokens": 1752460.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 5874 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 10.5, "completions/mean_terminated_length": 10.5, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 108.79629629629629, "frac_reward_zero_std": 1.0, "grad_norm": 0.09630302339792252, "kl": 0.004422686994075775, "learning_rate": 2.2922222222222223e-06, "loss": 0.0003, "num_tokens": 1752714.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 5875 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 108.81481481481481, "frac_reward_zero_std": 1.0, "grad_norm": 0.10041671246290207, "kl": 0.039275165647268295, "learning_rate": 2.2916666666666666e-06, "loss": 0.0018, "num_tokens": 1753002.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5876 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.75, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 16.75, "completions/mean_terminated_length": 16.75, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 108.83333333333333, "frac_reward_zero_std": 1.0, "grad_norm": 0.3086634576320648, "kl": 0.18983878940343857, "learning_rate": 2.2911111111111114e-06, "loss": 0.0095, "num_tokens": 1753309.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5877 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.5, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 9.5, "completions/mean_terminated_length": 9.5, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 108.85185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.023311452940106392, "kl": 0.0031690411269664764, "learning_rate": 2.2905555555555557e-06, "loss": 0.0002, "num_tokens": 1753567.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5878 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 108.87037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.00932671781629324, "kl": 0.008856192231178284, "learning_rate": 2.29e-06, "loss": 0.0004, "num_tokens": 1753803.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 5879 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.25, "completions/clipped_ratio": 0.0, "completions/max_length": 25.0, "completions/max_terminated_length": 25.0, "completions/mean_length": 22.25, "completions/mean_terminated_length": 22.25, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 108.88888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.16420596837997437, "kl": 0.02283555641770363, "learning_rate": 2.289444444444445e-06, "loss": 0.0012, "num_tokens": 1754136.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5880 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.25, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 18.25, "completions/mean_terminated_length": 18.25, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 108.9074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.04888879880309105, "kl": 0.012341694440692663, "learning_rate": 2.2888888888888892e-06, "loss": 0.0006, "num_tokens": 1754429.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5881 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.5, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 19.5, "completions/mean_terminated_length": 19.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 108.92592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.06728731840848923, "kl": 0.17712334170937538, "learning_rate": 2.2883333333333336e-06, "loss": 0.0099, "num_tokens": 1754755.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5882 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 5.5, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 5.5, "completions/mean_terminated_length": 5.5, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 108.94444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.016920270398259163, "kl": 0.0021712490415666252, "learning_rate": 2.287777777777778e-06, "loss": 0.0001, "num_tokens": 1754977.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 5883 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 34.5, "completions/clipped_ratio": 0.0, "completions/max_length": 36.0, "completions/max_terminated_length": 36.0, "completions/mean_length": 34.5, "completions/mean_terminated_length": 34.5, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 108.96296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.03793584927916527, "kl": 0.07664215564727783, "learning_rate": 2.2872222222222223e-06, "loss": 0.0038, "num_tokens": 1755343.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5884 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.75, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 22.75, "completions/mean_terminated_length": 22.75, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 108.98148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.08954383432865143, "kl": 0.026871073059737682, "learning_rate": 2.2866666666666667e-06, "loss": 0.0014, "num_tokens": 1755662.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5885 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 109.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.24705274403095245, "kl": 0.0381810970720835, "learning_rate": 2.286111111111111e-06, "loss": 0.0023, "num_tokens": 1755928.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 5886 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 109.01851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.10834630578756332, "kl": 0.018002016935497522, "learning_rate": 2.285555555555556e-06, "loss": 0.0009, "num_tokens": 1756200.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5887 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.5, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 18.5, "completions/mean_terminated_length": 18.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 109.03703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.10682010650634766, "kl": 0.014810490887612104, "learning_rate": 2.285e-06, "loss": 0.0008, "num_tokens": 1756502.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5888 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 109.05555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.07121554017066956, "kl": 0.019576849415898323, "learning_rate": 2.2844444444444445e-06, "loss": 0.001, "num_tokens": 1756792.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5889 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 109.07407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.05657891929149628, "kl": 0.021731029904913157, "learning_rate": 2.2838888888888893e-06, "loss": 0.0011, "num_tokens": 1757076.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5890 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 5.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 5.0, "completions/mean_terminated_length": 5.0, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 109.0925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.004642594140022993, "kl": 0.00026054382033180445, "learning_rate": 2.2833333333333336e-06, "loss": 0.0, "num_tokens": 1757296.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 5891 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 109.11111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.0005362896481528878, "kl": 3.422051668167114e-05, "learning_rate": 2.282777777777778e-06, "loss": 0.0, "num_tokens": 1757516.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 5892 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 30.5, "completions/clipped_ratio": 0.0, "completions/max_length": 37.0, "completions/max_terminated_length": 37.0, "completions/mean_length": 30.5, "completions/mean_terminated_length": 30.5, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 109.12962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.11602697521448135, "kl": 0.20188168436288834, "learning_rate": 2.2822222222222223e-06, "loss": 0.0101, "num_tokens": 1757854.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5893 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.5, "completions/clipped_ratio": 0.0, "completions/max_length": 35.0, "completions/max_terminated_length": 35.0, "completions/mean_length": 24.5, "completions/mean_terminated_length": 24.5, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 109.14814814814815, "frac_reward_zero_std": 0.0, "grad_norm": 4.42941951751709, "kl": 0.03967230208218098, "learning_rate": 2.2816666666666667e-06, "loss": 0.1543, "num_tokens": 1758204.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 5894 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 109.16666666666667, "frac_reward_zero_std": 1.0, "grad_norm": 0.04069337248802185, "kl": 0.006148248969111592, "learning_rate": 2.281111111111111e-06, "loss": 0.0003, "num_tokens": 1758500.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5895 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 109.18518518518519, "frac_reward_zero_std": 1.0, "grad_norm": 0.021574154496192932, "kl": 0.049264393746852875, "learning_rate": 2.280555555555556e-06, "loss": 0.0027, "num_tokens": 1758776.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5896 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 18.0, "completions/mean_terminated_length": 18.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 109.20370370370371, "frac_reward_zero_std": 1.0, "grad_norm": 0.44032105803489685, "kl": 0.10890871193259954, "learning_rate": 2.28e-06, "loss": 0.0058, "num_tokens": 1759068.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5897 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 109.22222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.009692159481346607, "kl": 0.008762218058109283, "learning_rate": 2.2794444444444445e-06, "loss": 0.0004, "num_tokens": 1759304.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 5898 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 30.0, "completions/clipped_ratio": 0.0, "completions/max_length": 36.0, "completions/max_terminated_length": 36.0, "completions/mean_length": 30.0, "completions/mean_terminated_length": 30.0, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 109.24074074074075, "frac_reward_zero_std": 0.0, "grad_norm": 2.254539966583252, "kl": 0.2867497578263283, "learning_rate": 2.278888888888889e-06, "loss": 0.012, "num_tokens": 1759648.0, "reward": 5.75, "reward_std": 4.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 4.5, "step": 5899 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 109.25925925925925, "frac_reward_zero_std": 1.0, "grad_norm": 0.020193960517644882, "kl": 0.012307190336287022, "learning_rate": 2.2783333333333337e-06, "loss": 0.0006, "num_tokens": 1759908.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5900 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.25, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 15.25, "completions/mean_terminated_length": 15.25, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 109.27777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.025786899030208588, "kl": 0.003703321795910597, "learning_rate": 2.277777777777778e-06, "loss": 0.0002, "num_tokens": 1760229.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5901 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 109.29629629629629, "frac_reward_zero_std": 1.0, "grad_norm": 0.03755111247301102, "kl": 0.0025747695472091436, "learning_rate": 2.2772222222222224e-06, "loss": 0.0001, "num_tokens": 1760517.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5902 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 10.5, "completions/mean_terminated_length": 10.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 109.31481481481481, "frac_reward_zero_std": 1.0, "grad_norm": 0.030178748071193695, "kl": 0.0007559453370049596, "learning_rate": 2.2766666666666668e-06, "loss": 0.0, "num_tokens": 1760783.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5903 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.25, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.25, "completions/mean_terminated_length": 14.25, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 109.33333333333333, "frac_reward_zero_std": 1.0, "grad_norm": 0.027035286650061607, "kl": 0.0014110198244452477, "learning_rate": 2.276111111111111e-06, "loss": 0.0001, "num_tokens": 1761052.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5904 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.25, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.25, "completions/mean_terminated_length": 14.25, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 109.35185185185185, "frac_reward_zero_std": 0.0, "grad_norm": 2.595728635787964, "kl": 0.16063790256157517, "learning_rate": 2.275555555555556e-06, "loss": -0.0061, "num_tokens": 1761341.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 5905 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 109.37037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.0037989288102835417, "kl": 0.0032053142786026, "learning_rate": 2.2750000000000002e-06, "loss": 0.0002, "num_tokens": 1761601.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 5906 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 109.38888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.038909267634153366, "kl": 0.006502642994746566, "learning_rate": 2.2744444444444446e-06, "loss": 0.0003, "num_tokens": 1761901.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5907 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 37.75, "completions/clipped_ratio": 0.0, "completions/max_length": 42.0, "completions/max_terminated_length": 42.0, "completions/mean_length": 37.75, "completions/mean_terminated_length": 37.75, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 109.4074074074074, "frac_reward_zero_std": 0.0, "grad_norm": 1.9845117330551147, "kl": 0.0861554965376854, "learning_rate": 2.273888888888889e-06, "loss": 0.0663, "num_tokens": 1762280.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 5908 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.5, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 8.5, "completions/mean_terminated_length": 8.5, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 109.42592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.04867205023765564, "kl": 0.0028351962682791054, "learning_rate": 2.2733333333333333e-06, "loss": 0.0001, "num_tokens": 1762534.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 5909 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.75, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 16.75, "completions/mean_terminated_length": 16.75, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 109.44444444444444, "frac_reward_zero_std": 0.0, "grad_norm": 5.127859115600586, "kl": 0.09900512173771858, "learning_rate": 2.272777777777778e-06, "loss": -0.0991, "num_tokens": 1762821.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 5910 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.25, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 17.25, "completions/mean_terminated_length": 17.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 109.46296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.059423595666885376, "kl": 0.17464345693588257, "learning_rate": 2.2722222222222224e-06, "loss": 0.0087, "num_tokens": 1763130.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5911 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 27.25, "completions/clipped_ratio": 0.0, "completions/max_length": 48.0, "completions/max_terminated_length": 48.0, "completions/mean_length": 27.25, "completions/mean_terminated_length": 27.25, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 109.48148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.02159995585680008, "kl": 0.016935485182330012, "learning_rate": 2.271666666666667e-06, "loss": 0.0009, "num_tokens": 1763459.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5912 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 42.0, "completions/clipped_ratio": 0.0, "completions/max_length": 45.0, "completions/max_terminated_length": 45.0, "completions/mean_length": 42.0, "completions/mean_terminated_length": 42.0, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 109.5, "frac_reward_zero_std": 0.0, "grad_norm": 1.0613025426864624, "kl": 0.08768868446350098, "learning_rate": 2.2711111111111116e-06, "loss": 0.0813, "num_tokens": 1763895.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 5913 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.25, "completions/clipped_ratio": 0.0, "completions/max_length": 31.0, "completions/max_terminated_length": 31.0, "completions/mean_length": 25.25, "completions/mean_terminated_length": 25.25, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 109.51851851851852, "frac_reward_zero_std": 0.0, "grad_norm": 3.0506815910339355, "kl": 0.04517035745084286, "learning_rate": 2.270555555555556e-06, "loss": 0.055, "num_tokens": 1764244.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 5914 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.0, "completions/clipped_ratio": 0.0, "completions/max_length": 30.0, "completions/max_terminated_length": 30.0, "completions/mean_length": 24.0, "completions/mean_terminated_length": 24.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 109.53703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.06152228266000748, "kl": 0.04922429099678993, "learning_rate": 2.2700000000000003e-06, "loss": 0.0025, "num_tokens": 1764560.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 5915 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 109.55555555555556, "frac_reward_zero_std": 0.0, "grad_norm": 1.3406851291656494, "kl": 0.6140462160110474, "learning_rate": 2.2694444444444446e-06, "loss": -0.0417, "num_tokens": 1764844.0, "reward": 7.875, "reward_std": 0.25, "rewards/reward_combined/mean": 7.875, "rewards/reward_combined/std": 0.25, "step": 5916 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 26.25, "completions/clipped_ratio": 0.0, "completions/max_length": 37.0, "completions/max_terminated_length": 37.0, "completions/mean_length": 26.25, "completions/mean_terminated_length": 26.25, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 109.57407407407408, "frac_reward_zero_std": 0.0, "grad_norm": 3.45070219039917, "kl": 0.028138757683336735, "learning_rate": 2.268888888888889e-06, "loss": 0.1275, "num_tokens": 1765193.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 5917 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 109.5925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.12882067263126373, "kl": 0.0073994523845613, "learning_rate": 2.2683333333333334e-06, "loss": 0.0004, "num_tokens": 1765457.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5918 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 109.61111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.006838653702288866, "kl": 0.0009372681379318237, "learning_rate": 2.2677777777777777e-06, "loss": 0.0, "num_tokens": 1765669.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 5919 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.25, "completions/clipped_ratio": 0.0, "completions/max_length": 28.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 25.25, "completions/mean_terminated_length": 25.25, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 109.62962962962963, "frac_reward_zero_std": 0.0, "grad_norm": 3.2016844749450684, "kl": 0.13347982615232468, "learning_rate": 2.2672222222222225e-06, "loss": -0.0226, "num_tokens": 1766006.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 5920 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 109.64814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.00788112636655569, "kl": 0.00022435486243921332, "learning_rate": 2.266666666666667e-06, "loss": 0.0, "num_tokens": 1766262.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 5921 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 109.66666666666667, "frac_reward_zero_std": 1.0, "grad_norm": 0.35934576392173767, "kl": 0.03974698483943939, "learning_rate": 2.2661111111111116e-06, "loss": 0.0024, "num_tokens": 1766474.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 5922 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 10.5, "completions/mean_terminated_length": 10.5, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 109.68518518518519, "frac_reward_zero_std": 1.0, "grad_norm": 0.01515404973179102, "kl": 0.0037307115271687508, "learning_rate": 2.265555555555556e-06, "loss": 0.0002, "num_tokens": 1766740.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5923 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 109.70370370370371, "frac_reward_zero_std": 1.0, "grad_norm": 0.01084819994866848, "kl": 0.0002528578042984009, "learning_rate": 2.2650000000000003e-06, "loss": 0.0, "num_tokens": 1766952.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 5924 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 109.72222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.05169876664876938, "kl": 0.004847709788009524, "learning_rate": 2.2644444444444447e-06, "loss": 0.0003, "num_tokens": 1767200.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 5925 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 109.74074074074075, "frac_reward_zero_std": 1.0, "grad_norm": 0.01042342372238636, "kl": 0.002151593565940857, "learning_rate": 2.263888888888889e-06, "loss": 0.0001, "num_tokens": 1767456.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5926 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.25, "completions/clipped_ratio": 0.0, "completions/max_length": 36.0, "completions/max_terminated_length": 36.0, "completions/mean_length": 24.25, "completions/mean_terminated_length": 24.25, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 109.75925925925925, "frac_reward_zero_std": 0.0, "grad_norm": 2.5185983180999756, "kl": 0.27015598863363266, "learning_rate": 2.2633333333333334e-06, "loss": 0.1404, "num_tokens": 1767773.0, "reward": 3.25, "reward_std": 3.0686588287353516, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 3.0686588287353516, "step": 5927 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 109.77777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.3368414640426636, "kl": 0.04871060885488987, "learning_rate": 2.2627777777777778e-06, "loss": 0.0029, "num_tokens": 1768009.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 5928 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.0, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 20.0, "completions/mean_terminated_length": 20.0, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 109.79629629629629, "frac_reward_zero_std": 1.0, "grad_norm": 0.03102540783584118, "kl": 0.025160010904073715, "learning_rate": 2.262222222222222e-06, "loss": 0.0013, "num_tokens": 1768341.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5929 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 35.25, "completions/clipped_ratio": 0.0, "completions/max_length": 45.0, "completions/max_terminated_length": 45.0, "completions/mean_length": 35.25, "completions/mean_terminated_length": 35.25, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 109.81481481481481, "frac_reward_zero_std": 0.0, "grad_norm": 3.02008056640625, "kl": 0.05476264841854572, "learning_rate": 2.261666666666667e-06, "loss": -0.0692, "num_tokens": 1768706.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 5930 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 26.5, "completions/clipped_ratio": 0.0, "completions/max_length": 34.0, "completions/max_terminated_length": 34.0, "completions/mean_length": 26.5, "completions/mean_terminated_length": 26.5, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 109.83333333333333, "frac_reward_zero_std": 0.0, "grad_norm": 2.599712371826172, "kl": 0.05883664824068546, "learning_rate": 2.2611111111111112e-06, "loss": 0.0464, "num_tokens": 1769040.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 5931 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 109.85185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.016485651955008507, "kl": 0.012676356360316277, "learning_rate": 2.260555555555556e-06, "loss": 0.0006, "num_tokens": 1769352.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 5932 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 18.0, "completions/mean_terminated_length": 18.0, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 109.87037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.05215999856591225, "kl": 0.1376678068190813, "learning_rate": 2.2600000000000004e-06, "loss": 0.008, "num_tokens": 1769672.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5933 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.5, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 12.5, "completions/mean_terminated_length": 12.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 109.88888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.03712412714958191, "kl": 0.005379386246204376, "learning_rate": 2.2594444444444447e-06, "loss": 0.0003, "num_tokens": 1769946.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5934 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.5, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.5, "completions/mean_terminated_length": 6.5, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 109.9074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.029552988708019257, "kl": 0.0013674618967343122, "learning_rate": 2.258888888888889e-06, "loss": 0.0001, "num_tokens": 1770180.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 5935 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.5, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 18.5, "completions/mean_terminated_length": 18.5, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 109.92592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.022398492321372032, "kl": 0.001862915582023561, "learning_rate": 2.2583333333333335e-06, "loss": 0.0001, "num_tokens": 1770494.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5936 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 109.94444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.04464924708008766, "kl": 0.013192293234169483, "learning_rate": 2.257777777777778e-06, "loss": 0.0007, "num_tokens": 1770778.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5937 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 109.96296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.031193474307656288, "kl": 0.2721620798110962, "learning_rate": 2.257222222222222e-06, "loss": 0.0136, "num_tokens": 1771074.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 5938 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 45.0, "completions/clipped_ratio": 0.0, "completions/max_length": 51.0, "completions/max_terminated_length": 51.0, "completions/mean_length": 45.0, "completions/mean_terminated_length": 45.0, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 109.98148148148148, "frac_reward_zero_std": 0.0, "grad_norm": 0.48038041591644287, "kl": 0.13822895847260952, "learning_rate": 2.2566666666666665e-06, "loss": 0.0069, "num_tokens": 1771534.0, "reward": 2.799999952316284, "reward_std": 0.4000000059604645, "rewards/reward_combined/mean": 2.799999952316284, "rewards/reward_combined/std": 0.4000000059604645, "step": 5939 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.0, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 22.0, "completions/mean_terminated_length": 22.0, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 110.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.03039879910647869, "kl": 0.013079929165542126, "learning_rate": 2.2561111111111113e-06, "loss": 0.0007, "num_tokens": 1771850.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5940 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 110.01851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.009366894140839577, "kl": 0.02395574632100761, "learning_rate": 2.2555555555555557e-06, "loss": 0.0014, "num_tokens": 1772122.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5941 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 110.03703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.0004154160269536078, "kl": 2.4937093257904053e-05, "learning_rate": 2.2550000000000004e-06, "loss": 0.0, "num_tokens": 1772342.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 5942 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 43.5, "completions/clipped_ratio": 0.0, "completions/max_length": 45.0, "completions/max_terminated_length": 45.0, "completions/mean_length": 43.5, "completions/mean_terminated_length": 43.5, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "epoch": 110.05555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.02722969837486744, "kl": 0.07369348034262657, "learning_rate": 2.254444444444445e-06, "loss": 0.0037, "num_tokens": 1772784.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5943 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.0, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 21.0, "completions/mean_terminated_length": 21.0, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 110.07407407407408, "frac_reward_zero_std": 0.0, "grad_norm": 2.5569519996643066, "kl": 0.06880928366445005, "learning_rate": 2.253888888888889e-06, "loss": 0.0053, "num_tokens": 1773112.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 5944 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 110.0925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.010714763775467873, "kl": 0.0021222957875579596, "learning_rate": 2.2533333333333335e-06, "loss": 0.0001, "num_tokens": 1773394.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5945 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.75, "completions/clipped_ratio": 0.0, "completions/max_length": 31.0, "completions/max_terminated_length": 31.0, "completions/mean_length": 25.75, "completions/mean_terminated_length": 25.75, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 110.11111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.36709219217300415, "kl": 0.20609743148088455, "learning_rate": 2.252777777777778e-06, "loss": 0.0104, "num_tokens": 1773713.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5946 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 110.12962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.07348138839006424, "kl": 0.03160404763184488, "learning_rate": 2.252222222222222e-06, "loss": 0.0016, "num_tokens": 1774013.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5947 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 110.14814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.026127615943551064, "kl": 0.0016930608544498682, "learning_rate": 2.2516666666666666e-06, "loss": 0.0001, "num_tokens": 1774281.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5948 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.5, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 22.5, "completions/mean_terminated_length": 22.5, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 110.16666666666667, "frac_reward_zero_std": 1.0, "grad_norm": 0.06882958114147186, "kl": 0.019593375734984875, "learning_rate": 2.2511111111111113e-06, "loss": 0.001, "num_tokens": 1774599.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5949 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 5.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 5.0, "completions/mean_terminated_length": 5.0, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 110.18518518518519, "frac_reward_zero_std": 1.0, "grad_norm": 0.0034451158717274666, "kl": 0.000252765414188616, "learning_rate": 2.2505555555555557e-06, "loss": 0.0, "num_tokens": 1774819.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 5950 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 44.5, "completions/clipped_ratio": 0.0, "completions/max_length": 51.0, "completions/max_terminated_length": 51.0, "completions/mean_length": 44.5, "completions/mean_terminated_length": 44.5, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "epoch": 110.20370370370371, "frac_reward_zero_std": 1.0, "grad_norm": 0.01441138330847025, "kl": 0.04845309630036354, "learning_rate": 2.25e-06, "loss": 0.0024, "num_tokens": 1775277.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 5951 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 18.0, "completions/mean_terminated_length": 18.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 110.22222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.007889985106885433, "kl": 0.001247240463271737, "learning_rate": 2.249444444444445e-06, "loss": 0.0001, "num_tokens": 1775589.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5952 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.25, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 16.25, "completions/mean_terminated_length": 16.25, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 110.24074074074075, "frac_reward_zero_std": 1.0, "grad_norm": 0.02623138576745987, "kl": 0.0030988093931227922, "learning_rate": 2.248888888888889e-06, "loss": 0.0002, "num_tokens": 1775914.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5953 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 110.25925925925925, "frac_reward_zero_std": 1.0, "grad_norm": 0.01693568378686905, "kl": 0.0013090491411276162, "learning_rate": 2.2483333333333335e-06, "loss": 0.0001, "num_tokens": 1776174.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 5954 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 110.27777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.017163613811135292, "kl": 0.0022517286706715822, "learning_rate": 2.247777777777778e-06, "loss": 0.0001, "num_tokens": 1776430.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5955 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.25, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 3.25, "completions/mean_terminated_length": 3.25, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 110.29629629629629, "frac_reward_zero_std": 1.0, "grad_norm": 0.04599517956376076, "kl": 0.005249442998319864, "learning_rate": 2.2472222222222223e-06, "loss": 0.0004, "num_tokens": 1776647.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 5956 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.0, "completions/clipped_ratio": 0.0, "completions/max_length": 28.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 25.0, "completions/mean_terminated_length": 25.0, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 110.31481481481481, "frac_reward_zero_std": 1.0, "grad_norm": 0.41550734639167786, "kl": 0.14398552104830742, "learning_rate": 2.2466666666666666e-06, "loss": 0.0072, "num_tokens": 1776983.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5957 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.5, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 110.33333333333333, "frac_reward_zero_std": 1.0, "grad_norm": 0.12507469952106476, "kl": 0.023559367284178734, "learning_rate": 2.2461111111111114e-06, "loss": 0.0013, "num_tokens": 1777269.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5958 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 34.5, "completions/clipped_ratio": 0.0, "completions/max_length": 41.0, "completions/max_terminated_length": 41.0, "completions/mean_length": 34.5, "completions/mean_terminated_length": 34.5, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 110.35185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.18057510256767273, "kl": 0.04954231716692448, "learning_rate": 2.2455555555555557e-06, "loss": 0.0024, "num_tokens": 1777631.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5959 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.75, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 13.75, "completions/mean_terminated_length": 13.75, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 110.37037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.028630077838897705, "kl": 0.007201259752037004, "learning_rate": 2.245e-06, "loss": 0.0004, "num_tokens": 1777918.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5960 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 110.38888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.019973179325461388, "kl": 0.009513222612440586, "learning_rate": 2.2444444444444445e-06, "loss": 0.0005, "num_tokens": 1778230.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 5961 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.75, "completions/clipped_ratio": 0.0, "completions/max_length": 35.0, "completions/max_terminated_length": 35.0, "completions/mean_length": 22.75, "completions/mean_terminated_length": 22.75, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 110.4074074074074, "frac_reward_zero_std": 0.0, "grad_norm": 3.5281026363372803, "kl": 0.15248388051986694, "learning_rate": 2.2438888888888892e-06, "loss": 0.2621, "num_tokens": 1778541.0, "reward": 5.125, "reward_std": 3.4731109142303467, "rewards/reward_combined/mean": 5.125, "rewards/reward_combined/std": 3.4731109142303467, "step": 5962 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.25, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 15.25, "completions/mean_terminated_length": 15.25, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 110.42592592592592, "frac_reward_zero_std": 0.0, "grad_norm": 5.238336086273193, "kl": 0.04210846871137619, "learning_rate": 2.2433333333333336e-06, "loss": 0.0165, "num_tokens": 1778822.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 5963 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 110.44444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.010368258692324162, "kl": 0.008508995175361633, "learning_rate": 2.242777777777778e-06, "loss": 0.0004, "num_tokens": 1779058.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 5964 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 110.46296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.00707264244556427, "kl": 0.0033807764993980527, "learning_rate": 2.2422222222222223e-06, "loss": 0.0002, "num_tokens": 1779326.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5965 }, { "clip_ratio/high_max": 0.01315789483487606, "clip_ratio/high_mean": 0.01315789483487606, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.01315789483487606, "completion_length": 22.5, "completions/clipped_ratio": 0.0, "completions/max_length": 29.0, "completions/max_terminated_length": 29.0, "completions/mean_length": 22.5, "completions/mean_terminated_length": 22.5, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 110.48148148148148, "frac_reward_zero_std": 0.0, "grad_norm": 2.8372042179107666, "kl": 0.16162938252091408, "learning_rate": 2.2416666666666667e-06, "loss": -0.0428, "num_tokens": 1779664.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 5966 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 9.5, "completions/mean_terminated_length": 9.5, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 110.5, "frac_reward_zero_std": 1.0, "grad_norm": 0.05508756265044212, "kl": 0.028000717982649803, "learning_rate": 2.2411111111111114e-06, "loss": 0.0012, "num_tokens": 1779914.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 5967 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 110.51851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.005292836576700211, "kl": 0.00015305280248867348, "learning_rate": 2.240555555555556e-06, "loss": 0.0, "num_tokens": 1780170.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 5968 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.5, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 2.5, "completions/mean_terminated_length": 2.5, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 110.53703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.01174973975867033, "kl": 0.0022232607007026672, "learning_rate": 2.24e-06, "loss": 0.0001, "num_tokens": 1780376.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 5969 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 110.55555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.05574768781661987, "kl": 0.2506001815199852, "learning_rate": 2.2394444444444445e-06, "loss": 0.0125, "num_tokens": 1780676.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 5970 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.75, "completions/clipped_ratio": 0.0, "completions/max_length": 33.0, "completions/max_terminated_length": 33.0, "completions/mean_length": 23.75, "completions/mean_terminated_length": 23.75, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 110.57407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 4.895984649658203, "kl": 0.5500674471259117, "learning_rate": 2.238888888888889e-06, "loss": 0.0338, "num_tokens": 1780999.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5971 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.25, "completions/clipped_ratio": 0.0, "completions/max_length": 25.0, "completions/max_terminated_length": 25.0, "completions/mean_length": 9.25, "completions/mean_terminated_length": 9.25, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 110.5925925925926, "frac_reward_zero_std": 0.0, "grad_norm": 6.823928356170654, "kl": 0.04604010283946991, "learning_rate": 2.2383333333333336e-06, "loss": 0.3574, "num_tokens": 1781236.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 5972 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.25, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 19.25, "completions/mean_terminated_length": 19.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 110.61111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.07921598851680756, "kl": 0.12719879299402237, "learning_rate": 2.237777777777778e-06, "loss": 0.0065, "num_tokens": 1781561.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5973 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.25, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.25, "completions/mean_terminated_length": 11.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 110.62962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.0758516937494278, "kl": 0.013077837880700827, "learning_rate": 2.2372222222222224e-06, "loss": 0.0007, "num_tokens": 1781834.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5974 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 35.25, "completions/clipped_ratio": 0.0, "completions/max_length": 36.0, "completions/max_terminated_length": 36.0, "completions/mean_length": 35.25, "completions/mean_terminated_length": 35.25, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 110.64814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.0449879914522171, "kl": 0.07694065570831299, "learning_rate": 2.236666666666667e-06, "loss": 0.0038, "num_tokens": 1782203.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5975 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.5, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 17.5, "completions/mean_terminated_length": 17.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 110.66666666666667, "frac_reward_zero_std": 1.0, "grad_norm": 0.08593840152025223, "kl": 0.14284632727503777, "learning_rate": 2.2361111111111115e-06, "loss": 0.0071, "num_tokens": 1782513.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5976 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 28.0, "completions/clipped_ratio": 0.0, "completions/max_length": 30.0, "completions/max_terminated_length": 30.0, "completions/mean_length": 28.0, "completions/mean_terminated_length": 28.0, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 110.68518518518519, "frac_reward_zero_std": 1.0, "grad_norm": 0.11745216697454453, "kl": 0.12420114129781723, "learning_rate": 2.235555555555556e-06, "loss": 0.0062, "num_tokens": 1782849.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5977 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.25, "completions/clipped_ratio": 0.0, "completions/max_length": 25.0, "completions/max_terminated_length": 25.0, "completions/mean_length": 20.25, "completions/mean_terminated_length": 20.25, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 110.70370370370371, "frac_reward_zero_std": 1.0, "grad_norm": 0.08315028250217438, "kl": 0.07299698144197464, "learning_rate": 2.235e-06, "loss": 0.0036, "num_tokens": 1783150.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 5978 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 56.5, "completions/clipped_ratio": 0.0, "completions/max_length": 138.0, "completions/max_terminated_length": 138.0, "completions/mean_length": 56.5, "completions/mean_terminated_length": 56.5, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 110.72222222222223, "frac_reward_zero_std": 0.0, "grad_norm": 2.6976733207702637, "kl": 0.08931900560855865, "learning_rate": 2.2344444444444446e-06, "loss": 0.0652, "num_tokens": 1783628.0, "reward": 3.75, "reward_std": 4.406434535980225, "rewards/reward_combined/mean": 3.75, "rewards/reward_combined/std": 4.406435012817383, "step": 5979 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.75, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.75, "completions/mean_terminated_length": 6.75, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 110.74074074074075, "frac_reward_zero_std": 1.0, "grad_norm": 0.012297810055315495, "kl": 0.0006979058089200407, "learning_rate": 2.233888888888889e-06, "loss": 0.0, "num_tokens": 1783863.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 5980 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 110.75925925925925, "frac_reward_zero_std": 1.0, "grad_norm": 0.025241898372769356, "kl": 0.0036167949438095093, "learning_rate": 2.2333333333333333e-06, "loss": 0.0002, "num_tokens": 1784123.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 5981 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.75, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 19.75, "completions/mean_terminated_length": 19.75, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 110.77777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.09484078735113144, "kl": 0.02444450743496418, "learning_rate": 2.232777777777778e-06, "loss": 0.0012, "num_tokens": 1784422.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5982 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.75, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 17.75, "completions/mean_terminated_length": 17.75, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 110.79629629629629, "frac_reward_zero_std": 1.0, "grad_norm": 0.021161450073122978, "kl": 0.003548164153471589, "learning_rate": 2.2322222222222224e-06, "loss": 0.0002, "num_tokens": 1784721.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5983 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.25, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.25, "completions/mean_terminated_length": 14.25, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 110.81481481481481, "frac_reward_zero_std": 1.0, "grad_norm": 0.042019110172986984, "kl": 0.004762112163007259, "learning_rate": 2.231666666666667e-06, "loss": 0.0002, "num_tokens": 1785010.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5984 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 110.83333333333333, "frac_reward_zero_std": 1.0, "grad_norm": 0.07296665012836456, "kl": 0.02101836074143648, "learning_rate": 2.2311111111111115e-06, "loss": 0.001, "num_tokens": 1785280.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5985 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.75, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 14.75, "completions/mean_terminated_length": 14.75, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 110.85185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.2356027066707611, "kl": 0.053674353286623955, "learning_rate": 2.230555555555556e-06, "loss": 0.0029, "num_tokens": 1785571.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5986 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.0, "completions/clipped_ratio": 0.0, "completions/max_length": 30.0, "completions/max_terminated_length": 30.0, "completions/mean_length": 23.0, "completions/mean_terminated_length": 23.0, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 110.87037037037037, "frac_reward_zero_std": 0.0, "grad_norm": 1.6790454387664795, "kl": 0.19412690587341785, "learning_rate": 2.2300000000000002e-06, "loss": 0.1131, "num_tokens": 1785915.0, "reward": 6.125, "reward_std": 3.75, "rewards/reward_combined/mean": 6.125, "rewards/reward_combined/std": 3.75, "step": 5987 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 26.5, "completions/clipped_ratio": 0.0, "completions/max_length": 45.0, "completions/max_terminated_length": 45.0, "completions/mean_length": 26.5, "completions/mean_terminated_length": 26.5, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 110.88888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.14883174002170563, "kl": 0.058524105697870255, "learning_rate": 2.2294444444444446e-06, "loss": 0.0026, "num_tokens": 1786241.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5988 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.75, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.75, "completions/mean_terminated_length": 14.75, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 110.9074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.054757047444581985, "kl": 0.012834566179662943, "learning_rate": 2.228888888888889e-06, "loss": 0.0006, "num_tokens": 1786532.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5989 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.5, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 4.5, "completions/mean_terminated_length": 4.5, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 110.92592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.11407911032438278, "kl": 0.006915999809280038, "learning_rate": 2.2283333333333333e-06, "loss": 0.0004, "num_tokens": 1786746.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 5990 }, { "clip_ratio/high_max": 0.01785714365541935, "clip_ratio/high_mean": 0.01785714365541935, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.01785714365541935, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 110.94444444444444, "frac_reward_zero_std": 0.0, "grad_norm": 4.559361934661865, "kl": 0.44862504303455353, "learning_rate": 2.2277777777777777e-06, "loss": 0.0557, "num_tokens": 1787030.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 5991 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.25, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 9.25, "completions/mean_terminated_length": 9.25, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 110.96296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.047531869262456894, "kl": 0.015913192182779312, "learning_rate": 2.2272222222222225e-06, "loss": 0.0008, "num_tokens": 1787291.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5992 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 110.98148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.019445689395070076, "kl": 0.0017895608361868653, "learning_rate": 2.226666666666667e-06, "loss": 0.0001, "num_tokens": 1787561.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5993 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 111.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.20143389701843262, "kl": 0.03340585716068745, "learning_rate": 2.2261111111111116e-06, "loss": 0.0018, "num_tokens": 1787865.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5994 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 111.01851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.06680736690759659, "kl": 0.019921076949685812, "learning_rate": 2.225555555555556e-06, "loss": 0.001, "num_tokens": 1788141.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 5995 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 31.25, "completions/clipped_ratio": 0.0, "completions/max_length": 32.0, "completions/max_terminated_length": 32.0, "completions/mean_length": 31.25, "completions/mean_terminated_length": 31.25, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 111.03703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.031808000057935715, "kl": 0.023084493353962898, "learning_rate": 2.2250000000000003e-06, "loss": 0.0012, "num_tokens": 1788490.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5996 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 26.75, "completions/clipped_ratio": 0.0, "completions/max_length": 30.0, "completions/max_terminated_length": 30.0, "completions/mean_length": 26.75, "completions/mean_terminated_length": 26.75, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 111.05555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.07420177012681961, "kl": 0.03127262834459543, "learning_rate": 2.2244444444444447e-06, "loss": 0.0016, "num_tokens": 1788845.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 5997 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 111.07407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.0005040796240791678, "kl": 3.141909837722778e-05, "learning_rate": 2.223888888888889e-06, "loss": 0.0, "num_tokens": 1789065.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 5998 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 37.5, "completions/clipped_ratio": 0.0, "completions/max_length": 53.0, "completions/max_terminated_length": 53.0, "completions/mean_length": 37.5, "completions/mean_terminated_length": 37.5, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 111.0925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.05184130370616913, "kl": 0.03520709369331598, "learning_rate": 2.2233333333333334e-06, "loss": 0.0013, "num_tokens": 1789435.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 5999 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.75, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.75, "completions/mean_terminated_length": 6.75, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 111.11111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.005197023041546345, "kl": 0.0005654962442349643, "learning_rate": 2.2227777777777777e-06, "loss": 0.0, "num_tokens": 1789670.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 6000 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 111.12962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.006456303410232067, "kl": 0.0008095830562524498, "learning_rate": 2.222222222222222e-06, "loss": 0.0, "num_tokens": 1789930.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 6001 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 111.14814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.020586607977747917, "kl": 0.009270508773624897, "learning_rate": 2.221666666666667e-06, "loss": 0.0005, "num_tokens": 1790242.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 6002 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 36.5, "completions/clipped_ratio": 0.0, "completions/max_length": 60.0, "completions/max_terminated_length": 60.0, "completions/mean_length": 36.5, "completions/mean_terminated_length": 36.5, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 111.16666666666667, "frac_reward_zero_std": 0.0, "grad_norm": 2.8345088958740234, "kl": 0.15681308507919312, "learning_rate": 2.221111111111111e-06, "loss": 0.1938, "num_tokens": 1790612.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 6003 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.25, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 17.25, "completions/mean_terminated_length": 17.25, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 111.18518518518519, "frac_reward_zero_std": 0.0, "grad_norm": 4.299498081207275, "kl": 0.058540353551506996, "learning_rate": 2.220555555555556e-06, "loss": 0.119, "num_tokens": 1790905.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 6004 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 28.5, "completions/clipped_ratio": 0.0, "completions/max_length": 48.0, "completions/max_terminated_length": 48.0, "completions/mean_length": 28.5, "completions/mean_terminated_length": 28.5, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 111.20370370370371, "frac_reward_zero_std": 0.0, "grad_norm": 3.583303213119507, "kl": 0.04641392081975937, "learning_rate": 2.2200000000000003e-06, "loss": 0.1323, "num_tokens": 1791271.0, "reward": 5.875, "reward_std": 3.25, "rewards/reward_combined/mean": 5.875, "rewards/reward_combined/std": 3.25, "step": 6005 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 111.22222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.016487352550029755, "kl": 0.0014634228427894413, "learning_rate": 2.2194444444444447e-06, "loss": 0.0001, "num_tokens": 1791559.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6006 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 111.24074074074075, "frac_reward_zero_std": 1.0, "grad_norm": 0.01917976699769497, "kl": 0.004492318956181407, "learning_rate": 2.218888888888889e-06, "loss": 0.0002, "num_tokens": 1791851.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6007 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 111.25925925925925, "frac_reward_zero_std": 1.0, "grad_norm": 0.04586009308695793, "kl": 0.01107635023072362, "learning_rate": 2.2183333333333334e-06, "loss": 0.0006, "num_tokens": 1792123.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6008 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 75.75, "completions/clipped_ratio": 0.25, "completions/max_length": 256.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 75.75, "completions/mean_terminated_length": 15.666666984558105, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 111.27777777777777, "frac_reward_zero_std": 0.0, "grad_norm": 1.5401326417922974, "kl": 0.046338689513504505, "learning_rate": 2.2177777777777778e-06, "loss": 0.4822, "num_tokens": 1792638.0, "reward": 4.125, "reward_std": 4.516174793243408, "rewards/reward_combined/mean": 4.125, "rewards/reward_combined/std": 4.516174793243408, "step": 6009 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.75, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 13.75, "completions/mean_terminated_length": 13.75, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 111.29629629629629, "frac_reward_zero_std": 1.0, "grad_norm": 0.09289556741714478, "kl": 0.025145677849650383, "learning_rate": 2.217222222222222e-06, "loss": 0.0013, "num_tokens": 1792925.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6010 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 43.0, "completions/clipped_ratio": 0.0, "completions/max_length": 45.0, "completions/max_terminated_length": 45.0, "completions/mean_length": 43.0, "completions/mean_terminated_length": 43.0, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 111.31481481481481, "frac_reward_zero_std": 1.0, "grad_norm": 0.010575572028756142, "kl": 0.07254448905587196, "learning_rate": 2.216666666666667e-06, "loss": 0.0036, "num_tokens": 1793365.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6011 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.25, "completions/clipped_ratio": 0.0, "completions/max_length": 35.0, "completions/max_terminated_length": 35.0, "completions/mean_length": 32.25, "completions/mean_terminated_length": 32.25, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 111.33333333333333, "frac_reward_zero_std": 0.0, "grad_norm": 1.5492562055587769, "kl": 0.16654383391141891, "learning_rate": 2.2161111111111113e-06, "loss": -0.0437, "num_tokens": 1793722.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 6012 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 31.25, "completions/clipped_ratio": 0.0, "completions/max_length": 39.0, "completions/max_terminated_length": 39.0, "completions/mean_length": 31.25, "completions/mean_terminated_length": 31.25, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 111.35185185185185, "frac_reward_zero_std": 0.0, "grad_norm": 3.309105396270752, "kl": 0.21283352375030518, "learning_rate": 2.2155555555555556e-06, "loss": 0.0827, "num_tokens": 1794063.0, "reward": 5.75, "reward_std": 4.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 4.5, "step": 6013 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.25, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 9.25, "completions/mean_terminated_length": 9.25, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 111.37037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.046444907784461975, "kl": 0.015599881298840046, "learning_rate": 2.2150000000000004e-06, "loss": 0.0008, "num_tokens": 1794324.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6014 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.012195121496915817, "clip_ratio/low_min": 0.012195121496915817, "clip_ratio/region_mean": 0.012195121496915817, "completion_length": 17.5, "completions/clipped_ratio": 0.0, "completions/max_length": 25.0, "completions/max_terminated_length": 25.0, "completions/mean_length": 17.5, "completions/mean_terminated_length": 17.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 111.38888888888889, "frac_reward_zero_std": 0.0, "grad_norm": 4.370845794677734, "kl": 0.21976159512996674, "learning_rate": 2.2144444444444447e-06, "loss": 0.116, "num_tokens": 1794614.0, "reward": 6.75, "reward_std": 2.1794495582580566, "rewards/reward_combined/mean": 6.75, "rewards/reward_combined/std": 2.1794495582580566, "step": 6015 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 111.4074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.07273008674383163, "kl": 0.0016966164112091064, "learning_rate": 2.213888888888889e-06, "loss": 0.0001, "num_tokens": 1794826.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 6016 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.75, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 18.75, "completions/mean_terminated_length": 18.75, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 111.42592592592592, "frac_reward_zero_std": 0.0, "grad_norm": 10.509722709655762, "kl": 2.0072591239586473, "learning_rate": 2.2133333333333335e-06, "loss": 0.1624, "num_tokens": 1795121.0, "reward": 5.0, "reward_std": 5.0, "rewards/reward_combined/mean": 5.0, "rewards/reward_combined/std": 5.0, "step": 6017 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.75, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.75, "completions/mean_terminated_length": 14.75, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 111.44444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.05892401188611984, "kl": 0.03905366361141205, "learning_rate": 2.212777777777778e-06, "loss": 0.0019, "num_tokens": 1795412.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6018 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 9.5, "completions/mean_terminated_length": 9.5, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 111.46296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.011106191202998161, "kl": 0.0010420647740829736, "learning_rate": 2.212222222222222e-06, "loss": 0.0, "num_tokens": 1795670.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6019 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.0, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 21.0, "completions/mean_terminated_length": 21.0, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 111.48148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.08389342576265335, "kl": 0.022280956618487835, "learning_rate": 2.211666666666667e-06, "loss": 0.0011, "num_tokens": 1796006.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6020 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 111.5, "frac_reward_zero_std": 1.0, "grad_norm": 0.008685830049216747, "kl": 0.00016584346303716302, "learning_rate": 2.2111111111111113e-06, "loss": 0.0, "num_tokens": 1796276.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6021 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 29.5, "completions/clipped_ratio": 0.0, "completions/max_length": 38.0, "completions/max_terminated_length": 38.0, "completions/mean_length": 29.5, "completions/mean_terminated_length": 29.5, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 111.51851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.0933811366558075, "kl": 0.07217695191502571, "learning_rate": 2.2105555555555557e-06, "loss": 0.0035, "num_tokens": 1796630.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6022 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 111.53703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.0019042453495785594, "kl": 0.00028144093812443316, "learning_rate": 2.21e-06, "loss": 0.0, "num_tokens": 1796926.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6023 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 111.55555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.028299512341618538, "kl": 0.002031564712524414, "learning_rate": 2.209444444444445e-06, "loss": 0.0001, "num_tokens": 1797134.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 6024 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.5, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 21.5, "completions/mean_terminated_length": 21.5, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 111.57407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.02766750566661358, "kl": 0.006831041071563959, "learning_rate": 2.208888888888889e-06, "loss": 0.0003, "num_tokens": 1797464.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6025 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 111.5925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.010297007858753204, "kl": 0.008467644453048706, "learning_rate": 2.2083333333333335e-06, "loss": 0.0004, "num_tokens": 1797700.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 6026 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 10.5, "completions/mean_terminated_length": 10.5, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 111.61111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.011764590628445148, "kl": 0.0032117210794240236, "learning_rate": 2.207777777777778e-06, "loss": 0.0002, "num_tokens": 1797966.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6027 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.5, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 17.5, "completions/mean_terminated_length": 17.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 111.62962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.04098737984895706, "kl": 0.022777595091611147, "learning_rate": 2.2072222222222222e-06, "loss": 0.0011, "num_tokens": 1798284.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6028 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 111.64814814814815, "frac_reward_zero_std": 0.0, "grad_norm": 2.4156970977783203, "kl": 0.291048139333725, "learning_rate": 2.206666666666667e-06, "loss": 0.0121, "num_tokens": 1798568.0, "reward": 7.75, "reward_std": 0.28867512941360474, "rewards/reward_combined/mean": 7.75, "rewards/reward_combined/std": 0.28867512941360474, "step": 6029 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 111.66666666666667, "frac_reward_zero_std": 1.0, "grad_norm": 0.001039661467075348, "kl": 0.00023731961846351624, "learning_rate": 2.2061111111111114e-06, "loss": 0.0, "num_tokens": 1798812.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 6030 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 111.68518518518519, "frac_reward_zero_std": 1.0, "grad_norm": 0.020005006343126297, "kl": 0.0033932551741600037, "learning_rate": 2.2055555555555557e-06, "loss": 0.0002, "num_tokens": 1799072.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 6031 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 111.70370370370371, "frac_reward_zero_std": 1.0, "grad_norm": 0.015535361133515835, "kl": 0.0003451049415161833, "learning_rate": 2.205e-06, "loss": 0.0, "num_tokens": 1799328.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 6032 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 111.72222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.050961706787347794, "kl": 0.244890958070755, "learning_rate": 2.2044444444444444e-06, "loss": 0.0122, "num_tokens": 1799628.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 6033 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 111.74074074074075, "frac_reward_zero_std": 1.0, "grad_norm": 0.015632743015885353, "kl": 0.0025843274779617786, "learning_rate": 2.203888888888889e-06, "loss": 0.0001, "num_tokens": 1799908.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6034 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 5.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 5.0, "completions/mean_terminated_length": 5.0, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 111.75925925925925, "frac_reward_zero_std": 1.0, "grad_norm": 0.002741132164373994, "kl": 0.00020079016394447535, "learning_rate": 2.2033333333333336e-06, "loss": 0.0, "num_tokens": 1800128.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 6035 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.25, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 14.25, "completions/mean_terminated_length": 14.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 111.77777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.15337330102920532, "kl": 0.08432025834918022, "learning_rate": 2.202777777777778e-06, "loss": 0.0044, "num_tokens": 1800409.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6036 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.5, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 17.5, "completions/mean_terminated_length": 17.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 111.79629629629629, "frac_reward_zero_std": 1.0, "grad_norm": 0.04545655846595764, "kl": 0.010111047886312008, "learning_rate": 2.2022222222222227e-06, "loss": 0.0005, "num_tokens": 1800719.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6037 }, { "clip_ratio/high_max": 0.005263158120214939, "clip_ratio/high_mean": 0.005263158120214939, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005263158120214939, "completion_length": 43.0, "completions/clipped_ratio": 0.0, "completions/max_length": 51.0, "completions/max_terminated_length": 51.0, "completions/mean_length": 43.0, "completions/mean_terminated_length": 43.0, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 111.81481481481481, "frac_reward_zero_std": 0.0, "grad_norm": 0.8340911865234375, "kl": 0.1857486516237259, "learning_rate": 2.201666666666667e-06, "loss": -0.0491, "num_tokens": 1801171.0, "reward": 2.375, "reward_std": 1.25, "rewards/reward_combined/mean": 2.375, "rewards/reward_combined/std": 1.25, "step": 6038 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.75, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 12.75, "completions/mean_terminated_length": 12.75, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 111.83333333333333, "frac_reward_zero_std": 1.0, "grad_norm": 0.09523662179708481, "kl": 0.03564588166773319, "learning_rate": 2.2011111111111114e-06, "loss": 0.0018, "num_tokens": 1801454.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6039 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.0, "completions/clipped_ratio": 0.0, "completions/max_length": 28.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 21.0, "completions/mean_terminated_length": 21.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 111.85185185185185, "frac_reward_zero_std": 0.0, "grad_norm": 2.6496996879577637, "kl": 0.1130510475486517, "learning_rate": 2.2005555555555558e-06, "loss": 0.0316, "num_tokens": 1801758.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 6040 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 111.87037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.01881404221057892, "kl": 0.029761552810668945, "learning_rate": 2.2e-06, "loss": 0.0015, "num_tokens": 1801974.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 6041 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 111.88888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.006149333901703358, "kl": 0.0005896240472793579, "learning_rate": 2.1994444444444445e-06, "loss": 0.0, "num_tokens": 1802186.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 6042 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.5, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 111.9074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.01992158591747284, "kl": 0.0025398588040843606, "learning_rate": 2.198888888888889e-06, "loss": 0.0001, "num_tokens": 1802508.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6043 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 111.92592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.16327790915966034, "kl": 0.017646205611526966, "learning_rate": 2.1983333333333336e-06, "loss": 0.001, "num_tokens": 1802780.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6044 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 17.0, "completions/mean_terminated_length": 17.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 111.94444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.019862182438373566, "kl": 0.13532210513949394, "learning_rate": 2.197777777777778e-06, "loss": 0.0068, "num_tokens": 1803088.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6045 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.75, "completions/clipped_ratio": 0.0, "completions/max_length": 31.0, "completions/max_terminated_length": 31.0, "completions/mean_length": 22.75, "completions/mean_terminated_length": 22.75, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 111.96296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.02087053284049034, "kl": 0.06326178647577763, "learning_rate": 2.1972222222222227e-06, "loss": 0.0031, "num_tokens": 1803407.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6046 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 27.25, "completions/clipped_ratio": 0.0, "completions/max_length": 43.0, "completions/max_terminated_length": 43.0, "completions/mean_length": 27.25, "completions/mean_terminated_length": 27.25, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 111.98148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 1.7012500762939453, "kl": 0.25212240777909756, "learning_rate": 2.196666666666667e-06, "loss": 0.0165, "num_tokens": 1803744.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6047 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.0, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 20.0, "completions/mean_terminated_length": 20.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 112.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.08849215507507324, "kl": 0.008362603257410228, "learning_rate": 2.1961111111111114e-06, "loss": 0.0005, "num_tokens": 1804064.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6048 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.75, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 17.75, "completions/mean_terminated_length": 17.75, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 112.01851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.04608449712395668, "kl": 0.17156820744276047, "learning_rate": 2.195555555555556e-06, "loss": 0.0086, "num_tokens": 1804375.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6049 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.25, "completions/clipped_ratio": 0.0, "completions/max_length": 47.0, "completions/max_terminated_length": 47.0, "completions/mean_length": 24.25, "completions/mean_terminated_length": 24.25, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 112.03703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.11990463733673096, "kl": 0.046005805023014545, "learning_rate": 2.195e-06, "loss": 0.0022, "num_tokens": 1804692.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6050 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 112.05555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.16025564074516296, "kl": 0.011379756033420563, "learning_rate": 2.1944444444444445e-06, "loss": 0.0006, "num_tokens": 1804912.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 6051 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 112.07407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.08678747713565826, "kl": 0.007718861103057861, "learning_rate": 2.193888888888889e-06, "loss": 0.0004, "num_tokens": 1805156.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 6052 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 112.0925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.00883081927895546, "kl": 0.0032119974493980408, "learning_rate": 2.1933333333333332e-06, "loss": 0.0002, "num_tokens": 1805416.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 6053 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.5, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.5, "completions/mean_terminated_length": 6.5, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 112.11111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.0341251976788044, "kl": 0.0017482978582847863, "learning_rate": 2.192777777777778e-06, "loss": 0.0001, "num_tokens": 1805650.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 6054 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 112.12962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.5243034362792969, "kl": 0.06926862895488739, "learning_rate": 2.1922222222222224e-06, "loss": 0.0035, "num_tokens": 1805866.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 6055 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.25, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 18.25, "completions/mean_terminated_length": 18.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 112.14814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.05626697838306427, "kl": 0.14360196143388748, "learning_rate": 2.191666666666667e-06, "loss": 0.0079, "num_tokens": 1806187.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6056 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 112.16666666666667, "frac_reward_zero_std": 1.0, "grad_norm": 0.0019882565829902887, "kl": 6.0535967350006104e-05, "learning_rate": 2.1911111111111115e-06, "loss": 0.0, "num_tokens": 1806399.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 6057 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 112.18518518518519, "frac_reward_zero_std": 1.0, "grad_norm": 0.10163866728544235, "kl": 0.02598479762673378, "learning_rate": 2.190555555555556e-06, "loss": 0.0013, "num_tokens": 1806687.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6058 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 28.25, "completions/clipped_ratio": 0.0, "completions/max_length": 51.0, "completions/max_terminated_length": 51.0, "completions/mean_length": 28.25, "completions/mean_terminated_length": 28.25, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 112.20370370370371, "frac_reward_zero_std": 1.0, "grad_norm": 0.021295465528964996, "kl": 0.012859976850450039, "learning_rate": 2.19e-06, "loss": 0.0007, "num_tokens": 1807020.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6059 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 34.25, "completions/clipped_ratio": 0.0, "completions/max_length": 38.0, "completions/max_terminated_length": 38.0, "completions/mean_length": 34.25, "completions/mean_terminated_length": 34.25, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 112.22222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.8200491070747375, "kl": 0.17906538024544716, "learning_rate": 2.1894444444444446e-06, "loss": 0.0091, "num_tokens": 1807385.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6060 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 112.24074074074075, "frac_reward_zero_std": 1.0, "grad_norm": 0.010625987313687801, "kl": 0.008292950689792633, "learning_rate": 2.188888888888889e-06, "loss": 0.0004, "num_tokens": 1807621.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 6061 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 26.75, "completions/clipped_ratio": 0.0, "completions/max_length": 32.0, "completions/max_terminated_length": 32.0, "completions/mean_length": 26.75, "completions/mean_terminated_length": 26.75, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 112.25925925925925, "frac_reward_zero_std": 0.0, "grad_norm": 2.767887592315674, "kl": 0.4960707724094391, "learning_rate": 2.1883333333333333e-06, "loss": 0.0537, "num_tokens": 1807944.0, "reward": 4.625, "reward_std": 2.25, "rewards/reward_combined/mean": 4.625, "rewards/reward_combined/std": 2.25, "step": 6062 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 38.75, "completions/clipped_ratio": 0.0, "completions/max_length": 44.0, "completions/max_terminated_length": 44.0, "completions/mean_length": 38.75, "completions/mean_terminated_length": 38.75, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 112.27777777777777, "frac_reward_zero_std": 0.0, "grad_norm": 2.052661895751953, "kl": 0.05778149887919426, "learning_rate": 2.1877777777777776e-06, "loss": -0.0032, "num_tokens": 1808323.0, "reward": 4.0, "reward_std": 4.041451930999756, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 4.041451930999756, "step": 6063 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 112.29629629629629, "frac_reward_zero_std": 1.0, "grad_norm": 0.004133060108870268, "kl": 0.00015216073370538652, "learning_rate": 2.1872222222222224e-06, "loss": 0.0, "num_tokens": 1808595.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6064 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 112.31481481481481, "frac_reward_zero_std": 1.0, "grad_norm": 0.05648043379187584, "kl": 0.006935239769518375, "learning_rate": 2.1866666666666668e-06, "loss": 0.0003, "num_tokens": 1808883.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6065 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 112.33333333333333, "frac_reward_zero_std": 1.0, "grad_norm": 0.185263991355896, "kl": 0.06419800035655499, "learning_rate": 2.1861111111111115e-06, "loss": 0.0032, "num_tokens": 1809173.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6066 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 112.35185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.020935801789164543, "kl": 0.0023669424699619412, "learning_rate": 2.185555555555556e-06, "loss": 0.0001, "num_tokens": 1809491.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6067 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.0, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 22.0, "completions/mean_terminated_length": 22.0, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 112.37037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.06464068591594696, "kl": 0.013613096438348293, "learning_rate": 2.1850000000000003e-06, "loss": 0.0007, "num_tokens": 1809807.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6068 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 112.38888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.08014682680368423, "kl": 0.030493064085021615, "learning_rate": 2.1844444444444446e-06, "loss": 0.0015, "num_tokens": 1810107.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6069 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.5, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 19.5, "completions/mean_terminated_length": 19.5, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 112.4074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.03485492989420891, "kl": 0.006461994023993611, "learning_rate": 2.183888888888889e-06, "loss": 0.0003, "num_tokens": 1810413.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6070 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 112.42592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.0014821706572547555, "kl": 0.0001199752077809535, "learning_rate": 2.1833333333333333e-06, "loss": 0.0, "num_tokens": 1810669.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 6071 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.5, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 21.5, "completions/mean_terminated_length": 21.5, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 112.44444444444444, "frac_reward_zero_std": 0.0, "grad_norm": 2.089195489883423, "kl": 0.24815510213375092, "learning_rate": 2.1827777777777777e-06, "loss": 0.0373, "num_tokens": 1810983.0, "reward": 7.0, "reward_std": 2.0, "rewards/reward_combined/mean": 7.0, "rewards/reward_combined/std": 2.0, "step": 6072 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 84.0, "completions/clipped_ratio": 0.25, "completions/max_length": 256.0, "completions/max_terminated_length": 30.0, "completions/mean_length": 84.0, "completions/mean_terminated_length": 26.666667938232422, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 112.46296296296296, "frac_reward_zero_std": 0.0, "grad_norm": 1.3383617401123047, "kl": 0.08651341870427132, "learning_rate": 2.1822222222222225e-06, "loss": 0.4261, "num_tokens": 1811555.0, "reward": 3.625, "reward_std": 5.202163219451904, "rewards/reward_combined/mean": 3.625, "rewards/reward_combined/std": 5.202163219451904, "step": 6073 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.75, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 16.75, "completions/mean_terminated_length": 16.75, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 112.48148148148148, "frac_reward_zero_std": 0.0, "grad_norm": 2.127575159072876, "kl": 0.11558055132627487, "learning_rate": 2.181666666666667e-06, "loss": -0.044, "num_tokens": 1811842.0, "reward": 7.875, "reward_std": 0.25, "rewards/reward_combined/mean": 7.875, "rewards/reward_combined/std": 0.25, "step": 6074 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 112.5, "frac_reward_zero_std": 1.0, "grad_norm": 0.00293197063729167, "kl": 0.001787553948815912, "learning_rate": 2.181111111111111e-06, "loss": 0.0001, "num_tokens": 1812122.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6075 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.25, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 21.25, "completions/mean_terminated_length": 21.25, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 112.51851851851852, "frac_reward_zero_std": 0.0, "grad_norm": 2.6724700927734375, "kl": 0.04169660899788141, "learning_rate": 2.180555555555556e-06, "loss": 0.0159, "num_tokens": 1812459.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 6076 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 112.53703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.02542734704911709, "kl": 0.00864492543041706, "learning_rate": 2.1800000000000003e-06, "loss": 0.0004, "num_tokens": 1812731.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6077 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 112.55555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.007984815165400505, "kl": 0.0032859371276572347, "learning_rate": 2.1794444444444447e-06, "loss": 0.0002, "num_tokens": 1812995.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6078 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.75, "completions/clipped_ratio": 0.0, "completions/max_length": 29.0, "completions/max_terminated_length": 29.0, "completions/mean_length": 25.75, "completions/mean_terminated_length": 25.75, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 112.57407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.0789913535118103, "kl": 0.02009462658315897, "learning_rate": 2.178888888888889e-06, "loss": 0.001, "num_tokens": 1813346.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 6079 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.5, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 22.5, "completions/mean_terminated_length": 22.5, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 112.5925925925926, "frac_reward_zero_std": 0.0, "grad_norm": 1.2764883041381836, "kl": 0.05030344473198056, "learning_rate": 2.1783333333333334e-06, "loss": 0.0222, "num_tokens": 1813680.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 6080 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 112.61111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.02512883022427559, "kl": 0.04920154204592109, "learning_rate": 2.1777777777777777e-06, "loss": 0.0025, "num_tokens": 1813956.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6081 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 112.62962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.02674880065023899, "kl": 0.011071652173995972, "learning_rate": 2.1772222222222225e-06, "loss": 0.0006, "num_tokens": 1814216.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6082 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 112.64814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.021456070244312286, "kl": 0.009049532003700733, "learning_rate": 2.176666666666667e-06, "loss": 0.0005, "num_tokens": 1814528.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 6083 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.25, "completions/clipped_ratio": 0.0, "completions/max_length": 28.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 22.25, "completions/mean_terminated_length": 22.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 112.66666666666667, "frac_reward_zero_std": 1.0, "grad_norm": 0.10913977771997452, "kl": 0.029417719691991806, "learning_rate": 2.1761111111111112e-06, "loss": 0.0016, "num_tokens": 1814869.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6084 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 38.0, "completions/clipped_ratio": 0.0, "completions/max_length": 45.0, "completions/max_terminated_length": 45.0, "completions/mean_length": 38.0, "completions/mean_terminated_length": 38.0, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 112.68518518518519, "frac_reward_zero_std": 1.0, "grad_norm": 0.3510313332080841, "kl": 0.12328135967254639, "learning_rate": 2.1755555555555556e-06, "loss": 0.0055, "num_tokens": 1815289.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6085 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.75, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 13.75, "completions/mean_terminated_length": 13.75, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 112.70370370370371, "frac_reward_zero_std": 0.0, "grad_norm": 3.8163766860961914, "kl": 0.1504650991410017, "learning_rate": 2.1750000000000004e-06, "loss": -0.0132, "num_tokens": 1815560.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 6086 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 112.72222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.005437879357486963, "kl": 0.0003678947687149048, "learning_rate": 2.1744444444444447e-06, "loss": 0.0, "num_tokens": 1815772.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 6087 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 35.75, "completions/clipped_ratio": 0.0, "completions/max_length": 42.0, "completions/max_terminated_length": 42.0, "completions/mean_length": 35.75, "completions/mean_terminated_length": 35.75, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 112.74074074074075, "frac_reward_zero_std": 0.0, "grad_norm": 2.143961191177368, "kl": 0.2059275507926941, "learning_rate": 2.173888888888889e-06, "loss": 0.1161, "num_tokens": 1816139.0, "reward": 3.375, "reward_std": 3.3008837699890137, "rewards/reward_combined/mean": 3.375, "rewards/reward_combined/std": 3.3008837699890137, "step": 6088 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 112.75925925925925, "frac_reward_zero_std": 1.0, "grad_norm": 0.11010470986366272, "kl": 0.012802592013031244, "learning_rate": 2.1733333333333334e-06, "loss": 0.0007, "num_tokens": 1816407.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6089 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 43.5, "completions/clipped_ratio": 0.0, "completions/max_length": 51.0, "completions/max_terminated_length": 51.0, "completions/mean_length": 43.5, "completions/mean_terminated_length": 43.5, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "epoch": 112.77777777777777, "frac_reward_zero_std": 0.0, "grad_norm": 0.5388325452804565, "kl": 0.1784424725919962, "learning_rate": 2.1727777777777778e-06, "loss": -0.0024, "num_tokens": 1816861.0, "reward": 2.375, "reward_std": 1.25, "rewards/reward_combined/mean": 2.375, "rewards/reward_combined/std": 1.25, "step": 6090 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 112.79629629629629, "frac_reward_zero_std": 1.0, "grad_norm": 0.07453395426273346, "kl": 0.009805553127080202, "learning_rate": 2.1722222222222226e-06, "loss": 0.0005, "num_tokens": 1817161.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6091 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 112.81481481481481, "frac_reward_zero_std": 1.0, "grad_norm": 0.01216393057256937, "kl": 0.28001829981803894, "learning_rate": 2.171666666666667e-06, "loss": 0.014, "num_tokens": 1817449.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6092 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.75, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 6.75, "completions/mean_terminated_length": 6.75, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 112.83333333333333, "frac_reward_zero_std": 1.0, "grad_norm": 3.32770037651062, "kl": 0.4544735610470525, "learning_rate": 2.1711111111111113e-06, "loss": 0.0321, "num_tokens": 1817676.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 6093 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 112.85185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.009272870607674122, "kl": 0.0008522808493580669, "learning_rate": 2.1705555555555556e-06, "loss": 0.0, "num_tokens": 1817936.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 6094 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.75, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 15.75, "completions/mean_terminated_length": 15.75, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 112.87037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.05767339840531349, "kl": 0.22652824223041534, "learning_rate": 2.17e-06, "loss": 0.0113, "num_tokens": 1818239.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 6095 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 112.88888888888889, "frac_reward_zero_std": 0.0, "grad_norm": 1.830675721168518, "kl": 0.20501817762851715, "learning_rate": 2.1694444444444448e-06, "loss": 0.0145, "num_tokens": 1818523.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 6096 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.5, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 18.5, "completions/mean_terminated_length": 18.5, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 112.9074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.029846949502825737, "kl": 0.006603237998206168, "learning_rate": 2.168888888888889e-06, "loss": 0.0003, "num_tokens": 1818837.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6097 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.5, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 3.5, "completions/mean_terminated_length": 3.5, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 112.92592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.09349546581506729, "kl": 0.0076245516538619995, "learning_rate": 2.1683333333333335e-06, "loss": 0.0004, "num_tokens": 1819047.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 6098 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.0, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 19.0, "completions/mean_terminated_length": 19.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 112.94444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.042213354259729385, "kl": 0.016038633417338133, "learning_rate": 2.1677777777777782e-06, "loss": 0.0008, "num_tokens": 1819343.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6099 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 9.5, "completions/mean_terminated_length": 9.5, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 112.96296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.01657204143702984, "kl": 0.0014169029891490936, "learning_rate": 2.1672222222222226e-06, "loss": 0.0001, "num_tokens": 1819601.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6100 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 112.98148148148148, "frac_reward_zero_std": 0.0, "grad_norm": 3.283689260482788, "kl": 0.12908913288265467, "learning_rate": 2.166666666666667e-06, "loss": 0.0816, "num_tokens": 1819889.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 6101 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0017605633474886417, "clip_ratio/low_min": 0.0017605633474886417, "clip_ratio/region_mean": 0.0017605633474886417, "completion_length": 82.0, "completions/clipped_ratio": 0.25, "completions/max_length": 256.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 82.0, "completions/mean_terminated_length": 24.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 113.0, "frac_reward_zero_std": 0.0, "grad_norm": 1.6196420192718506, "kl": 0.12353427708148956, "learning_rate": 2.1661111111111113e-06, "loss": 0.4054, "num_tokens": 1820437.0, "reward": 2.75, "reward_std": 2.5, "rewards/reward_combined/mean": 2.75, "rewards/reward_combined/std": 2.5, "step": 6102 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 113.01851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.050891418009996414, "kl": 0.0014057705411687493, "learning_rate": 2.1655555555555557e-06, "loss": 0.0001, "num_tokens": 1820653.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 6103 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.5, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 20.5, "completions/mean_terminated_length": 20.5, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 113.03703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.033340346068143845, "kl": 0.017981277778744698, "learning_rate": 2.165e-06, "loss": 0.0009, "num_tokens": 1820987.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6104 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 113.05555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.18413469195365906, "kl": 0.03118019551038742, "learning_rate": 2.1644444444444444e-06, "loss": 0.0016, "num_tokens": 1821275.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6105 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 113.07407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.028812585398554802, "kl": 0.00842451537027955, "learning_rate": 2.163888888888889e-06, "loss": 0.0004, "num_tokens": 1821547.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6106 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 55.0, "completions/clipped_ratio": 0.0, "completions/max_length": 63.0, "completions/max_terminated_length": 63.0, "completions/mean_length": 55.0, "completions/mean_terminated_length": 55.0, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "epoch": 113.0925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.04702376201748848, "kl": 0.04576274752616882, "learning_rate": 2.1633333333333335e-06, "loss": 0.0023, "num_tokens": 1822047.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 6107 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 113.11111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.05289900302886963, "kl": 0.008646835340186954, "learning_rate": 2.1627777777777783e-06, "loss": 0.0005, "num_tokens": 1822319.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6108 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 113.12962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.038547784090042114, "kl": 0.022501381783513352, "learning_rate": 2.1622222222222226e-06, "loss": 0.0012, "num_tokens": 1822617.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6109 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.5, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 3.5, "completions/mean_terminated_length": 3.5, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 113.14814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.028225108981132507, "kl": 0.002023942768573761, "learning_rate": 2.161666666666667e-06, "loss": 0.0001, "num_tokens": 1822827.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 6110 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.5, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.5, "completions/mean_terminated_length": 6.5, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 113.16666666666667, "frac_reward_zero_std": 1.0, "grad_norm": 0.020313026383519173, "kl": 0.0008874102204572409, "learning_rate": 2.1611111111111114e-06, "loss": 0.0, "num_tokens": 1823061.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 6111 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 26.5, "completions/clipped_ratio": 0.0, "completions/max_length": 50.0, "completions/max_terminated_length": 50.0, "completions/mean_length": 26.5, "completions/mean_terminated_length": 26.5, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 113.18518518518519, "frac_reward_zero_std": 0.0, "grad_norm": 3.7371909618377686, "kl": 0.06771684810519218, "learning_rate": 2.1605555555555557e-06, "loss": 0.1713, "num_tokens": 1823419.0, "reward": 5.375, "reward_std": 2.462214469909668, "rewards/reward_combined/mean": 5.375, "rewards/reward_combined/std": 2.462214469909668, "step": 6112 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 18.0, "completions/mean_terminated_length": 18.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 113.20370370370371, "frac_reward_zero_std": 0.0, "grad_norm": 7.006613731384277, "kl": 0.07978888042271137, "learning_rate": 2.16e-06, "loss": 0.0365, "num_tokens": 1823711.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 6113 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.75, "completions/clipped_ratio": 0.0, "completions/max_length": 31.0, "completions/max_terminated_length": 31.0, "completions/mean_length": 22.75, "completions/mean_terminated_length": 22.75, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 113.22222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.16663555800914764, "kl": 0.07949415594339371, "learning_rate": 2.1594444444444444e-06, "loss": 0.0039, "num_tokens": 1824030.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6114 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.75, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.75, "completions/mean_terminated_length": 14.75, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 113.24074074074075, "frac_reward_zero_std": 0.0, "grad_norm": 2.158132314682007, "kl": 0.28016945719718933, "learning_rate": 2.1588888888888888e-06, "loss": 0.0251, "num_tokens": 1824321.0, "reward": 5.125, "reward_std": 3.4731109142303467, "rewards/reward_combined/mean": 5.125, "rewards/reward_combined/std": 3.4731109142303467, "step": 6115 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.25, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 18.25, "completions/mean_terminated_length": 18.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 113.25925925925925, "frac_reward_zero_std": 1.0, "grad_norm": 0.09442679584026337, "kl": 0.16323138773441315, "learning_rate": 2.1583333333333336e-06, "loss": 0.0081, "num_tokens": 1824634.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6116 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.5, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 22.5, "completions/mean_terminated_length": 22.5, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 113.27777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.041469134390354156, "kl": 0.011115437373518944, "learning_rate": 2.157777777777778e-06, "loss": 0.0006, "num_tokens": 1824952.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6117 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.75, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.75, "completions/mean_terminated_length": 14.75, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 113.29629629629629, "frac_reward_zero_std": 0.0, "grad_norm": 4.43546724319458, "kl": 0.05777533911168575, "learning_rate": 2.1572222222222227e-06, "loss": -0.0074, "num_tokens": 1825243.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 6118 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 113.31481481481481, "frac_reward_zero_std": 1.0, "grad_norm": 0.0073332469910383224, "kl": 7.040798664093018e-05, "learning_rate": 2.156666666666667e-06, "loss": 0.0, "num_tokens": 1825455.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 6119 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 113.33333333333333, "frac_reward_zero_std": 1.0, "grad_norm": 0.012551339343190193, "kl": 0.003149431198835373, "learning_rate": 2.1561111111111114e-06, "loss": 0.0002, "num_tokens": 1825715.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 6120 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 113.35185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.6059343814849854, "kl": 0.10288797691464424, "learning_rate": 2.1555555555555558e-06, "loss": 0.0051, "num_tokens": 1825991.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6121 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.25, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 15.25, "completions/mean_terminated_length": 15.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 113.37037037037037, "frac_reward_zero_std": 0.0, "grad_norm": 13.204689979553223, "kl": 1.6243532951921225, "learning_rate": 2.155e-06, "loss": 0.1756, "num_tokens": 1826276.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 6122 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 113.38888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.10166820883750916, "kl": 0.01871702226344496, "learning_rate": 2.1544444444444445e-06, "loss": 0.001, "num_tokens": 1826532.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 6123 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.25, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 4.25, "completions/mean_terminated_length": 4.25, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 113.4074074074074, "frac_reward_zero_std": 0.0, "grad_norm": 10.827630996704102, "kl": 0.12185200303792953, "learning_rate": 2.153888888888889e-06, "loss": 0.0523, "num_tokens": 1826769.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 6124 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 113.42592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.001974203623831272, "kl": 0.0016993965255096555, "learning_rate": 2.153333333333333e-06, "loss": 0.0001, "num_tokens": 1827049.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6125 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.25, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.25, "completions/mean_terminated_length": 14.25, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 113.44444444444444, "frac_reward_zero_std": 0.0, "grad_norm": 1.4478684663772583, "kl": 0.02028178024920635, "learning_rate": 2.152777777777778e-06, "loss": -0.0128, "num_tokens": 1827338.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 6126 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 113.46296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.0036866224836558104, "kl": 0.0001261120087292511, "learning_rate": 2.1522222222222223e-06, "loss": 0.0, "num_tokens": 1827606.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6127 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.75, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 17.75, "completions/mean_terminated_length": 17.75, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 113.48148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.03261922299861908, "kl": 0.006584532093256712, "learning_rate": 2.151666666666667e-06, "loss": 0.0003, "num_tokens": 1827905.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6128 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.75, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.75, "completions/mean_terminated_length": 14.75, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 113.5, "frac_reward_zero_std": 1.0, "grad_norm": 0.01683327741920948, "kl": 0.002235178602859378, "learning_rate": 2.1511111111111115e-06, "loss": 0.0001, "num_tokens": 1828224.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6129 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 113.51851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.014461666345596313, "kl": 0.0002311646967427805, "learning_rate": 2.150555555555556e-06, "loss": 0.0, "num_tokens": 1828480.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 6130 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 43.0, "completions/clipped_ratio": 0.0, "completions/max_length": 89.0, "completions/max_terminated_length": 89.0, "completions/mean_length": 43.0, "completions/mean_terminated_length": 43.0, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 113.53703703703704, "frac_reward_zero_std": 0.0, "grad_norm": 3.2286970615386963, "kl": 0.13908874988555908, "learning_rate": 2.15e-06, "loss": 0.2872, "num_tokens": 1828876.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 6131 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 18.0, "completions/mean_terminated_length": 18.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 113.55555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.16649535298347473, "kl": 0.1676137074828148, "learning_rate": 2.1494444444444445e-06, "loss": 0.0084, "num_tokens": 1829168.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6132 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 18.0, "completions/mean_terminated_length": 18.0, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 113.57407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.035835642367601395, "kl": 0.004651409282814711, "learning_rate": 2.148888888888889e-06, "loss": 0.0002, "num_tokens": 1829480.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6133 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.5, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 113.5925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.024530988186597824, "kl": 0.01192379737040028, "learning_rate": 2.1483333333333332e-06, "loss": 0.0007, "num_tokens": 1829754.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6134 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.75, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 9.75, "completions/mean_terminated_length": 9.75, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 113.61111111111111, "frac_reward_zero_std": 0.0, "grad_norm": 1.9350672960281372, "kl": 0.4306074529886246, "learning_rate": 2.147777777777778e-06, "loss": 0.0473, "num_tokens": 1830017.0, "reward": 4.0, "reward_std": 4.041451930999756, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 4.041451930999756, "step": 6135 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.75, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 19.75, "completions/mean_terminated_length": 19.75, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 113.62962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.13648706674575806, "kl": 0.0764613188803196, "learning_rate": 2.1472222222222224e-06, "loss": 0.0038, "num_tokens": 1830316.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 6136 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 113.64814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.04307585209608078, "kl": 0.008385344874113798, "learning_rate": 2.1466666666666667e-06, "loss": 0.0004, "num_tokens": 1830614.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6137 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 113.66666666666667, "frac_reward_zero_std": 1.0, "grad_norm": 0.03987269103527069, "kl": 0.25702646374702454, "learning_rate": 2.1461111111111115e-06, "loss": 0.0128, "num_tokens": 1830912.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 6138 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.0, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 22.0, "completions/mean_terminated_length": 22.0, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 113.68518518518519, "frac_reward_zero_std": 0.0, "grad_norm": 2.4255263805389404, "kl": 0.013406151905655861, "learning_rate": 2.145555555555556e-06, "loss": -0.0016, "num_tokens": 1831244.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 6139 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 38.5, "completions/clipped_ratio": 0.0, "completions/max_length": 71.0, "completions/max_terminated_length": 71.0, "completions/mean_length": 38.5, "completions/mean_terminated_length": 38.5, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 113.70370370370371, "frac_reward_zero_std": 0.0, "grad_norm": 2.895092010498047, "kl": 0.13369911909103394, "learning_rate": 2.1450000000000002e-06, "loss": -0.2357, "num_tokens": 1831634.0, "reward": 7.625, "reward_std": 0.25, "rewards/reward_combined/mean": 7.625, "rewards/reward_combined/std": 0.25, "step": 6140 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 113.72222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 1.0395591743872501e-05, "kl": 5.7891011238098145e-06, "learning_rate": 2.1444444444444446e-06, "loss": 0.0, "num_tokens": 1831854.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 6141 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.75, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 19.75, "completions/mean_terminated_length": 19.75, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 113.74074074074075, "frac_reward_zero_std": 0.0, "grad_norm": 2.8534014225006104, "kl": 0.09217309020459652, "learning_rate": 2.143888888888889e-06, "loss": 0.0582, "num_tokens": 1832153.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 6142 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 113.75925925925925, "frac_reward_zero_std": 1.0, "grad_norm": 0.014884160831570625, "kl": 0.02859894186258316, "learning_rate": 2.1433333333333333e-06, "loss": 0.0014, "num_tokens": 1832369.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 6143 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.25, "completions/clipped_ratio": 0.0, "completions/max_length": 29.0, "completions/max_terminated_length": 29.0, "completions/mean_length": 24.25, "completions/mean_terminated_length": 24.25, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 113.77777777777777, "frac_reward_zero_std": 0.0, "grad_norm": 3.704893112182617, "kl": 0.07611641474068165, "learning_rate": 2.142777777777778e-06, "loss": 0.0355, "num_tokens": 1832714.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 6144 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.25, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 8.25, "completions/mean_terminated_length": 8.25, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 113.79629629629629, "frac_reward_zero_std": 0.0, "grad_norm": 2.3947129249572754, "kl": 0.23444068431854248, "learning_rate": 2.1422222222222224e-06, "loss": 0.0433, "num_tokens": 1832959.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 6145 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 113.81481481481481, "frac_reward_zero_std": 1.0, "grad_norm": 0.017887303605675697, "kl": 0.012296376749873161, "learning_rate": 2.1416666666666668e-06, "loss": 0.0006, "num_tokens": 1833271.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 6146 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 44.25, "completions/clipped_ratio": 0.0, "completions/max_length": 45.0, "completions/max_terminated_length": 45.0, "completions/mean_length": 44.25, "completions/mean_terminated_length": 44.25, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "epoch": 113.83333333333333, "frac_reward_zero_std": 1.0, "grad_norm": 0.010002432391047478, "kl": 0.07002153992652893, "learning_rate": 2.141111111111111e-06, "loss": 0.0035, "num_tokens": 1833716.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6147 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.5, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 4.5, "completions/mean_terminated_length": 4.5, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 113.85185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.08037087321281433, "kl": 0.009971158346161246, "learning_rate": 2.140555555555556e-06, "loss": 0.0004, "num_tokens": 1833934.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 6148 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 29.25, "completions/clipped_ratio": 0.0, "completions/max_length": 33.0, "completions/max_terminated_length": 33.0, "completions/mean_length": 29.25, "completions/mean_terminated_length": 29.25, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 113.87037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.07061821967363358, "kl": 0.026679322123527527, "learning_rate": 2.1400000000000003e-06, "loss": 0.0013, "num_tokens": 1834275.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6149 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.5, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 113.88888888888889, "frac_reward_zero_std": 0.0, "grad_norm": 2.8225958347320557, "kl": 0.0656715645454824, "learning_rate": 2.1394444444444446e-06, "loss": 0.1403, "num_tokens": 1834561.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 6150 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.0, "completions/max_length": 34.0, "completions/max_terminated_length": 34.0, "completions/mean_length": 32.0, "completions/mean_terminated_length": 32.0, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 113.9074074074074, "frac_reward_zero_std": 0.0, "grad_norm": 2.1618075370788574, "kl": 0.17496081441640854, "learning_rate": 2.138888888888889e-06, "loss": 0.0011, "num_tokens": 1834917.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 6151 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 28.5, "completions/clipped_ratio": 0.0, "completions/max_length": 53.0, "completions/max_terminated_length": 53.0, "completions/mean_length": 28.5, "completions/mean_terminated_length": 28.5, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 113.92592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.024663705378770828, "kl": 0.016083345282822847, "learning_rate": 2.1383333333333333e-06, "loss": 0.0008, "num_tokens": 1835251.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6152 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.25, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 15.25, "completions/mean_terminated_length": 15.25, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 113.94444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.04985775426030159, "kl": 0.2501007616519928, "learning_rate": 2.137777777777778e-06, "loss": 0.0126, "num_tokens": 1835536.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6153 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.25, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 19.25, "completions/mean_terminated_length": 19.25, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 113.96296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.04155077040195465, "kl": 0.19860661891289055, "learning_rate": 2.1372222222222225e-06, "loss": 0.0092, "num_tokens": 1835861.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6154 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 28.5, "completions/clipped_ratio": 0.0, "completions/max_length": 29.0, "completions/max_terminated_length": 29.0, "completions/mean_length": 28.5, "completions/mean_terminated_length": 28.5, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 113.98148148148148, "frac_reward_zero_std": 0.0, "grad_norm": 2.5671331882476807, "kl": 0.2499275803565979, "learning_rate": 2.136666666666667e-06, "loss": 0.0145, "num_tokens": 1836191.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 6155 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 114.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.02440754324197769, "kl": 0.0012932829558849335, "learning_rate": 2.136111111111111e-06, "loss": 0.0001, "num_tokens": 1836443.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6156 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 114.01851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.02165374904870987, "kl": 0.00024612993001937866, "learning_rate": 2.1355555555555555e-06, "loss": 0.0, "num_tokens": 1836655.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 6157 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.25, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 20.25, "completions/mean_terminated_length": 20.25, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 114.03703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.02975427359342575, "kl": 0.12990756472572684, "learning_rate": 2.1350000000000003e-06, "loss": 0.0069, "num_tokens": 1836984.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6158 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.5, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 12.5, "completions/mean_terminated_length": 12.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 114.05555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.02270490676164627, "kl": 0.0031796805560588837, "learning_rate": 2.1344444444444447e-06, "loss": 0.0002, "num_tokens": 1837258.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6159 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 114.07407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.029155215248465538, "kl": 0.2763414680957794, "learning_rate": 2.133888888888889e-06, "loss": 0.0138, "num_tokens": 1837546.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6160 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 31.5, "completions/clipped_ratio": 0.0, "completions/max_length": 33.0, "completions/max_terminated_length": 33.0, "completions/mean_length": 31.5, "completions/mean_terminated_length": 31.5, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 114.0925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.0473967120051384, "kl": 0.020719011314213276, "learning_rate": 2.133333333333334e-06, "loss": 0.001, "num_tokens": 1837896.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6161 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 114.11111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.038214121013879776, "kl": 0.013288504269439727, "learning_rate": 2.132777777777778e-06, "loss": 0.0007, "num_tokens": 1838182.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6162 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 5.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 5.0, "completions/mean_terminated_length": 5.0, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 114.12962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.0019931381102651358, "kl": 7.161498069763184e-05, "learning_rate": 2.1322222222222225e-06, "loss": 0.0, "num_tokens": 1838402.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 6163 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.0, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 17.0, "completions/mean_terminated_length": 17.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 114.14814814814815, "frac_reward_zero_std": 0.0, "grad_norm": 7.0834197998046875, "kl": 0.025553373619914055, "learning_rate": 2.131666666666667e-06, "loss": 0.1396, "num_tokens": 1838702.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 6164 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 45.25, "completions/clipped_ratio": 0.0, "completions/max_length": 51.0, "completions/max_terminated_length": 51.0, "completions/mean_length": 45.25, "completions/mean_terminated_length": 45.25, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "epoch": 114.16666666666667, "frac_reward_zero_std": 0.0, "grad_norm": 2.5772111415863037, "kl": 0.03709984943270683, "learning_rate": 2.1311111111111112e-06, "loss": -0.0407, "num_tokens": 1839103.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 6165 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.75, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 12.75, "completions/mean_terminated_length": 12.75, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 114.18518518518519, "frac_reward_zero_std": 1.0, "grad_norm": 0.10489331930875778, "kl": 0.022829919820651412, "learning_rate": 2.1305555555555556e-06, "loss": 0.0011, "num_tokens": 1839386.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6166 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 34.25, "completions/clipped_ratio": 0.0, "completions/max_length": 37.0, "completions/max_terminated_length": 37.0, "completions/mean_length": 34.25, "completions/mean_terminated_length": 34.25, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 114.20370370370371, "frac_reward_zero_std": 1.0, "grad_norm": 0.48678353428840637, "kl": 0.18745502829551697, "learning_rate": 2.13e-06, "loss": 0.0095, "num_tokens": 1839747.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6167 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 114.22222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.05848684161901474, "kl": 0.015806270763278008, "learning_rate": 2.1294444444444447e-06, "loss": 0.0008, "num_tokens": 1840045.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6168 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 114.24074074074075, "frac_reward_zero_std": 1.0, "grad_norm": 5.524867447093129e-05, "kl": 6.623566150665283e-06, "learning_rate": 2.128888888888889e-06, "loss": 0.0, "num_tokens": 1840265.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 6169 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 114.25925925925925, "frac_reward_zero_std": 0.0, "grad_norm": 7.43329381942749, "kl": 0.04148198012262583, "learning_rate": 2.128333333333334e-06, "loss": 0.187, "num_tokens": 1840535.0, "reward": 6.5, "reward_std": 2.0, "rewards/reward_combined/mean": 6.5, "rewards/reward_combined/std": 2.0, "step": 6170 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 114.27777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.2223164290189743, "kl": 0.04026585631072521, "learning_rate": 2.127777777777778e-06, "loss": 0.0023, "num_tokens": 1840809.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6171 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.5, "completions/clipped_ratio": 0.0, "completions/max_length": 28.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 23.5, "completions/mean_terminated_length": 23.5, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 114.29629629629629, "frac_reward_zero_std": 0.0, "grad_norm": 2.1885359287261963, "kl": 0.1020142063498497, "learning_rate": 2.1272222222222226e-06, "loss": -0.0319, "num_tokens": 1841123.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 6172 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 114.31481481481481, "frac_reward_zero_std": 1.0, "grad_norm": 0.021078990772366524, "kl": 0.009551001712679863, "learning_rate": 2.126666666666667e-06, "loss": 0.0005, "num_tokens": 1841435.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 6173 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.25, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 8.25, "completions/mean_terminated_length": 8.25, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 114.33333333333333, "frac_reward_zero_std": 1.0, "grad_norm": 0.7166193723678589, "kl": 0.13280972093343735, "learning_rate": 2.1261111111111113e-06, "loss": 0.0064, "num_tokens": 1841680.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 6174 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 114.35185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.05884962156414986, "kl": 0.006969264941290021, "learning_rate": 2.1255555555555556e-06, "loss": 0.0003, "num_tokens": 1841976.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6175 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 114.37037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.04612640291452408, "kl": 0.24723601341247559, "learning_rate": 2.125e-06, "loss": 0.0123, "num_tokens": 1842276.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 6176 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.25, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 13.25, "completions/mean_terminated_length": 13.25, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 114.38888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.03924407809972763, "kl": 0.014103528577834368, "learning_rate": 2.1244444444444443e-06, "loss": 0.0007, "num_tokens": 1842549.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6177 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.5, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 16.5, "completions/mean_terminated_length": 16.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 114.4074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.041142597794532776, "kl": 0.006588342483155429, "learning_rate": 2.123888888888889e-06, "loss": 0.0003, "num_tokens": 1842843.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6178 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.0, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 23.0, "completions/mean_terminated_length": 23.0, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 114.42592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.12202104926109314, "kl": 0.026150868274271488, "learning_rate": 2.1233333333333335e-06, "loss": 0.0013, "num_tokens": 1843163.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6179 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 12.5, "completions/mean_terminated_length": 12.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 114.44444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.14485938847064972, "kl": 0.029192526824772358, "learning_rate": 2.1227777777777783e-06, "loss": 0.0014, "num_tokens": 1843437.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6180 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.25, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 15.25, "completions/mean_terminated_length": 15.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 114.46296296296296, "frac_reward_zero_std": 0.0, "grad_norm": 2.832287073135376, "kl": 0.19112340733408928, "learning_rate": 2.1222222222222226e-06, "loss": 0.0968, "num_tokens": 1843722.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 6181 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 45.0, "completions/clipped_ratio": 0.0, "completions/max_length": 78.0, "completions/max_terminated_length": 78.0, "completions/mean_length": 45.0, "completions/mean_terminated_length": 45.0, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 114.48148148148148, "frac_reward_zero_std": 0.0, "grad_norm": 2.9722588062286377, "kl": 0.08412356302142143, "learning_rate": 2.121666666666667e-06, "loss": 0.2069, "num_tokens": 1844130.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 6182 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 114.5, "frac_reward_zero_std": 0.0, "grad_norm": 2.0585877895355225, "kl": 0.00022876859293319285, "learning_rate": 2.1211111111111113e-06, "loss": 0.0482, "num_tokens": 1844398.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 6183 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.5, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 21.5, "completions/mean_terminated_length": 21.5, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 114.51851851851852, "frac_reward_zero_std": 0.0, "grad_norm": 3.504530191421509, "kl": 0.0356330550275743, "learning_rate": 2.1205555555555557e-06, "loss": 0.004, "num_tokens": 1844728.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 6184 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 114.53703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.03549295663833618, "kl": 0.004016675055027008, "learning_rate": 2.12e-06, "loss": 0.0002, "num_tokens": 1844988.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 6185 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 114.55555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.010701787658035755, "kl": 0.0019233025377616286, "learning_rate": 2.1194444444444444e-06, "loss": 0.0001, "num_tokens": 1845310.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6186 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 41.0, "completions/clipped_ratio": 0.0, "completions/max_length": 45.0, "completions/max_terminated_length": 45.0, "completions/mean_length": 41.0, "completions/mean_terminated_length": 41.0, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 114.57407407407408, "frac_reward_zero_std": 0.0, "grad_norm": 2.4138386249542236, "kl": 0.21466002613306046, "learning_rate": 2.1188888888888887e-06, "loss": -0.0995, "num_tokens": 1845742.0, "reward": 5.75, "reward_std": 4.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 4.5, "step": 6187 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 114.5925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.09328605979681015, "kl": 0.05844890233129263, "learning_rate": 2.1183333333333335e-06, "loss": 0.0029, "num_tokens": 1846014.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6188 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.0, "completions/clipped_ratio": 0.0, "completions/max_length": 28.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 25.0, "completions/mean_terminated_length": 25.0, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 114.61111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.07074805349111557, "kl": 0.0354470768943429, "learning_rate": 2.117777777777778e-06, "loss": 0.0018, "num_tokens": 1846366.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6189 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.5, "completions/clipped_ratio": 0.0, "completions/max_length": 31.0, "completions/max_terminated_length": 31.0, "completions/mean_length": 24.5, "completions/mean_terminated_length": 24.5, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 114.62962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.5716776251792908, "kl": 0.10378778353333473, "learning_rate": 2.1172222222222227e-06, "loss": 0.0054, "num_tokens": 1846692.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6190 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 114.64814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.00467537809163332, "kl": 0.01018562912940979, "learning_rate": 2.116666666666667e-06, "loss": 0.0005, "num_tokens": 1846928.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 6191 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 114.66666666666667, "frac_reward_zero_std": 1.0, "grad_norm": 0.05650400370359421, "kl": 0.002824599388986826, "learning_rate": 2.1161111111111114e-06, "loss": 0.0001, "num_tokens": 1847184.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6192 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.25, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 21.25, "completions/mean_terminated_length": 21.25, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 114.68518518518519, "frac_reward_zero_std": 1.0, "grad_norm": 0.03519514575600624, "kl": 0.016815371811389923, "learning_rate": 2.1155555555555557e-06, "loss": 0.0008, "num_tokens": 1847521.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6193 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 44.0, "completions/clipped_ratio": 0.0, "completions/max_length": 51.0, "completions/max_terminated_length": 51.0, "completions/mean_length": 44.0, "completions/mean_terminated_length": 44.0, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "epoch": 114.70370370370371, "frac_reward_zero_std": 1.0, "grad_norm": 0.14464792609214783, "kl": 0.06504988670349121, "learning_rate": 2.115e-06, "loss": 0.0033, "num_tokens": 1847977.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 6194 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.0, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 20.0, "completions/mean_terminated_length": 20.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 114.72222222222223, "frac_reward_zero_std": 0.0, "grad_norm": 2.5485353469848633, "kl": 0.14055982697755098, "learning_rate": 2.1144444444444444e-06, "loss": 0.0322, "num_tokens": 1848277.0, "reward": 5.0, "reward_std": 5.0, "rewards/reward_combined/mean": 5.0, "rewards/reward_combined/std": 5.0, "step": 6195 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.25, "completions/clipped_ratio": 0.0, "completions/max_length": 25.0, "completions/max_terminated_length": 25.0, "completions/mean_length": 21.25, "completions/mean_terminated_length": 21.25, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 114.74074074074075, "frac_reward_zero_std": 1.0, "grad_norm": 0.19939638674259186, "kl": 0.038467807695269585, "learning_rate": 2.113888888888889e-06, "loss": 0.002, "num_tokens": 1848610.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 6196 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.5, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 114.75925925925925, "frac_reward_zero_std": 1.0, "grad_norm": 0.23498916625976562, "kl": 0.033667486684862524, "learning_rate": 2.1133333333333336e-06, "loss": 0.002, "num_tokens": 1848884.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6197 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.75, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.75, "completions/mean_terminated_length": 14.75, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 114.77777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.1272200495004654, "kl": 0.050681451335549355, "learning_rate": 2.112777777777778e-06, "loss": 0.0025, "num_tokens": 1849175.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6198 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.5, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 18.5, "completions/mean_terminated_length": 18.5, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 114.79629629629629, "frac_reward_zero_std": 1.0, "grad_norm": 0.01798631064593792, "kl": 0.0013722286676056683, "learning_rate": 2.1122222222222223e-06, "loss": 0.0001, "num_tokens": 1849489.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6199 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 114.81481481481481, "frac_reward_zero_std": 1.0, "grad_norm": 0.024359581992030144, "kl": 0.00212163629475981, "learning_rate": 2.111666666666667e-06, "loss": 0.0001, "num_tokens": 1849749.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 6200 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.25, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 17.25, "completions/mean_terminated_length": 17.25, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 114.83333333333333, "frac_reward_zero_std": 1.0, "grad_norm": 1.2152575254440308, "kl": 0.26867593824863434, "learning_rate": 2.1111111111111114e-06, "loss": 0.0141, "num_tokens": 1850038.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6201 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 6.0, "completions/mean_terminated_length": 6.0, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 114.85185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.01963965967297554, "kl": 0.0006205091485753655, "learning_rate": 2.1105555555555558e-06, "loss": 0.0, "num_tokens": 1850270.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 6202 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.75, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 18.75, "completions/mean_terminated_length": 18.75, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 114.87037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.3028826117515564, "kl": 0.20876678824424744, "learning_rate": 2.11e-06, "loss": 0.0107, "num_tokens": 1850585.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6203 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 114.88888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.018353184685111046, "kl": 0.005074805696494877, "learning_rate": 2.1094444444444445e-06, "loss": 0.0003, "num_tokens": 1850849.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6204 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 114.9074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.01984129473567009, "kl": 0.028564006090164185, "learning_rate": 2.108888888888889e-06, "loss": 0.0014, "num_tokens": 1851065.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 6205 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 114.92592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.0267886184155941, "kl": 0.003085225820541382, "learning_rate": 2.1083333333333336e-06, "loss": 0.0002, "num_tokens": 1851273.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 6206 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.25, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 10.25, "completions/mean_terminated_length": 10.25, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 114.94444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.022300630807876587, "kl": 0.0004849293181905523, "learning_rate": 2.107777777777778e-06, "loss": 0.0, "num_tokens": 1851530.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 6207 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 30.75, "completions/clipped_ratio": 0.0, "completions/max_length": 36.0, "completions/max_terminated_length": 36.0, "completions/mean_length": 30.75, "completions/mean_terminated_length": 30.75, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 114.96296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 1.064625859260559, "kl": 0.2958908900618553, "learning_rate": 2.1072222222222223e-06, "loss": 0.0151, "num_tokens": 1851869.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6208 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 29.5, "completions/clipped_ratio": 0.0, "completions/max_length": 31.0, "completions/max_terminated_length": 31.0, "completions/mean_length": 29.5, "completions/mean_terminated_length": 29.5, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 114.98148148148148, "frac_reward_zero_std": 0.0, "grad_norm": 2.1493866443634033, "kl": 0.18813808262348175, "learning_rate": 2.1066666666666667e-06, "loss": 0.0014, "num_tokens": 1852223.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 6209 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 115.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.0067878179252147675, "kl": 0.00067177414894104, "learning_rate": 2.1061111111111115e-06, "loss": 0.0, "num_tokens": 1852435.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 6210 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.5, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 115.01851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.09506691247224808, "kl": 0.025525257922708988, "learning_rate": 2.105555555555556e-06, "loss": 0.0013, "num_tokens": 1852713.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6211 }, { "clip_ratio/high_max": 0.006097560748457909, "clip_ratio/high_mean": 0.006097560748457909, "clip_ratio/low_mean": 0.005494505632668734, "clip_ratio/low_min": 0.005494505632668734, "clip_ratio/region_mean": 0.011592066381126642, "completion_length": 43.25, "completions/clipped_ratio": 0.0, "completions/max_length": 59.0, "completions/max_terminated_length": 59.0, "completions/mean_length": 43.25, "completions/mean_terminated_length": 43.25, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 115.03703703703704, "frac_reward_zero_std": 0.0, "grad_norm": 2.97737717628479, "kl": 0.07139049842953682, "learning_rate": 2.105e-06, "loss": 0.1557, "num_tokens": 1853110.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 6212 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.75, "completions/clipped_ratio": 0.0, "completions/max_length": 42.0, "completions/max_terminated_length": 42.0, "completions/mean_length": 32.75, "completions/mean_terminated_length": 32.75, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 115.05555555555556, "frac_reward_zero_std": 0.0, "grad_norm": 1.5091521739959717, "kl": 0.46007873117923737, "learning_rate": 2.1044444444444445e-06, "loss": -0.0489, "num_tokens": 1853457.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 6213 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 31.0, "completions/clipped_ratio": 0.0, "completions/max_length": 37.0, "completions/max_terminated_length": 37.0, "completions/mean_length": 31.0, "completions/mean_terminated_length": 31.0, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 115.07407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.23878154158592224, "kl": 0.1584746465086937, "learning_rate": 2.103888888888889e-06, "loss": 0.0079, "num_tokens": 1853805.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6214 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 26.5, "completions/clipped_ratio": 0.0, "completions/max_length": 58.0, "completions/max_terminated_length": 58.0, "completions/mean_length": 26.5, "completions/mean_terminated_length": 26.5, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 115.0925925925926, "frac_reward_zero_std": 0.0, "grad_norm": 3.3895304203033447, "kl": 0.11532693356275558, "learning_rate": 2.1033333333333337e-06, "loss": 0.2884, "num_tokens": 1854131.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 6215 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 18.0, "completions/mean_terminated_length": 18.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 115.11111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.0076521714217960835, "kl": 0.000981319259153679, "learning_rate": 2.102777777777778e-06, "loss": 0.0, "num_tokens": 1854443.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6216 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 115.12962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.007051572669297457, "kl": 0.0009278059296775609, "learning_rate": 2.1022222222222224e-06, "loss": 0.0, "num_tokens": 1854703.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 6217 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.5, "completions/clipped_ratio": 0.0, "completions/max_length": 35.0, "completions/max_terminated_length": 35.0, "completions/mean_length": 25.5, "completions/mean_terminated_length": 25.5, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 115.14814814814815, "frac_reward_zero_std": 0.0, "grad_norm": 1.5012973546981812, "kl": 0.30463026463985443, "learning_rate": 2.1016666666666667e-06, "loss": -0.0317, "num_tokens": 1855057.0, "reward": 6.875, "reward_std": 1.6007810831069946, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 1.6007810831069946, "step": 6218 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 115.16666666666667, "frac_reward_zero_std": 1.0, "grad_norm": 0.031357865780591965, "kl": 0.0017311499104835093, "learning_rate": 2.101111111111111e-06, "loss": 0.0001, "num_tokens": 1855313.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6219 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 115.18518518518519, "frac_reward_zero_std": 1.0, "grad_norm": 0.017504926770925522, "kl": 0.012553073465824127, "learning_rate": 2.100555555555556e-06, "loss": 0.0006, "num_tokens": 1855625.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 6220 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.75, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.75, "completions/mean_terminated_length": 14.75, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 115.20370370370371, "frac_reward_zero_std": 1.0, "grad_norm": 0.01802029460668564, "kl": 0.0023014716571196914, "learning_rate": 2.1000000000000002e-06, "loss": 0.0001, "num_tokens": 1855944.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6221 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 115.22222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.011903711594641209, "kl": 0.00022271274792728946, "learning_rate": 2.0994444444444446e-06, "loss": 0.0, "num_tokens": 1856200.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 6222 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 115.24074074074075, "frac_reward_zero_std": 1.0, "grad_norm": 0.006694296840578318, "kl": 0.0003984086215496063, "learning_rate": 2.0988888888888894e-06, "loss": 0.0, "num_tokens": 1856444.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 6223 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 26.75, "completions/clipped_ratio": 0.0, "completions/max_length": 30.0, "completions/max_terminated_length": 30.0, "completions/mean_length": 26.75, "completions/mean_terminated_length": 26.75, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 115.25925925925925, "frac_reward_zero_std": 0.0, "grad_norm": 1.1288267374038696, "kl": 0.21828464418649673, "learning_rate": 2.0983333333333337e-06, "loss": -0.0372, "num_tokens": 1856787.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 6224 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.25, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.25, "completions/mean_terminated_length": 14.25, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 115.27777777777777, "frac_reward_zero_std": 0.0, "grad_norm": 2.5936522483825684, "kl": 0.021667358581908047, "learning_rate": 2.097777777777778e-06, "loss": -0.0151, "num_tokens": 1857056.0, "reward": 7.25, "reward_std": 1.5, "rewards/reward_combined/mean": 7.25, "rewards/reward_combined/std": 1.5, "step": 6225 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 38.0, "completions/clipped_ratio": 0.0, "completions/max_length": 53.0, "completions/max_terminated_length": 53.0, "completions/mean_length": 38.0, "completions/mean_terminated_length": 38.0, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 115.29629629629629, "frac_reward_zero_std": 0.0, "grad_norm": 0.9736804962158203, "kl": 0.13971627689898014, "learning_rate": 2.0972222222222224e-06, "loss": -0.1934, "num_tokens": 1857428.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 6226 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 47.5, "completions/clipped_ratio": 0.0, "completions/max_length": 51.0, "completions/max_terminated_length": 51.0, "completions/mean_length": 47.5, "completions/mean_terminated_length": 47.5, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 115.31481481481481, "frac_reward_zero_std": 1.0, "grad_norm": 0.026000099256634712, "kl": 0.045444661751389503, "learning_rate": 2.0966666666666668e-06, "loss": 0.0023, "num_tokens": 1857898.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 6227 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.0, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 21.0, "completions/mean_terminated_length": 21.0, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 115.33333333333333, "frac_reward_zero_std": 1.0, "grad_norm": 0.07722605764865875, "kl": 0.021902869921177626, "learning_rate": 2.096111111111111e-06, "loss": 0.0011, "num_tokens": 1858226.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6228 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 115.35185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.05660555884242058, "kl": 0.010604064911603928, "learning_rate": 2.0955555555555555e-06, "loss": 0.0005, "num_tokens": 1858498.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6229 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 115.37037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.026310674846172333, "kl": 0.00583695201203227, "learning_rate": 2.0950000000000003e-06, "loss": 0.0003, "num_tokens": 1858788.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6230 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 26.0, "completions/clipped_ratio": 0.0, "completions/max_length": 38.0, "completions/max_terminated_length": 38.0, "completions/mean_length": 26.0, "completions/mean_terminated_length": 26.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 115.38888888888889, "frac_reward_zero_std": 0.0, "grad_norm": 2.743093967437744, "kl": 0.25420641899108887, "learning_rate": 2.0944444444444446e-06, "loss": 0.0733, "num_tokens": 1859120.0, "reward": 5.125, "reward_std": 3.4731109142303467, "rewards/reward_combined/mean": 5.125, "rewards/reward_combined/std": 3.4731109142303467, "step": 6231 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 115.4074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.07478886097669601, "kl": 0.048848215490579605, "learning_rate": 2.093888888888889e-06, "loss": 0.0024, "num_tokens": 1859396.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6232 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 115.42592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.05656025931239128, "kl": 0.01245266618207097, "learning_rate": 2.0933333333333338e-06, "loss": 0.0006, "num_tokens": 1859696.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6233 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 115.44444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.010207122191786766, "kl": 0.0006819168629590422, "learning_rate": 2.092777777777778e-06, "loss": 0.0, "num_tokens": 1859912.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 6234 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.75, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 18.75, "completions/mean_terminated_length": 18.75, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 115.46296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.02823273465037346, "kl": 0.014853685162961483, "learning_rate": 2.0922222222222225e-06, "loss": 0.0007, "num_tokens": 1860207.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6235 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.0, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 20.0, "completions/mean_terminated_length": 20.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 115.48148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.09800989925861359, "kl": 0.08881719782948494, "learning_rate": 2.091666666666667e-06, "loss": 0.0043, "num_tokens": 1860507.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 6236 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 115.5, "frac_reward_zero_std": 1.0, "grad_norm": 0.02497619204223156, "kl": 0.015346252359449863, "learning_rate": 2.091111111111111e-06, "loss": 0.0008, "num_tokens": 1860773.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6237 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 115.51851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.08878032118082047, "kl": 0.23614290356636047, "learning_rate": 2.0905555555555555e-06, "loss": 0.0118, "num_tokens": 1861075.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6238 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 36.5, "completions/clipped_ratio": 0.0, "completions/max_length": 41.0, "completions/max_terminated_length": 41.0, "completions/mean_length": 36.5, "completions/mean_terminated_length": 36.5, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 115.53703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.09772758930921555, "kl": 0.06559448689222336, "learning_rate": 2.09e-06, "loss": 0.0032, "num_tokens": 1861449.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6239 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.5, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 21.5, "completions/mean_terminated_length": 21.5, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 115.55555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.14731749892234802, "kl": 0.024723736569285393, "learning_rate": 2.0894444444444447e-06, "loss": 0.0012, "num_tokens": 1861787.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6240 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 7.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 7.0, "completions/mean_terminated_length": 7.0, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 115.57407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.011515884660184383, "kl": 0.0006388468318618834, "learning_rate": 2.088888888888889e-06, "loss": 0.0, "num_tokens": 1862023.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 6241 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.5, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 17.5, "completions/mean_terminated_length": 17.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 115.5925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.05641988664865494, "kl": 0.16969963163137436, "learning_rate": 2.088333333333334e-06, "loss": 0.0085, "num_tokens": 1862333.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6242 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.25, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 9.25, "completions/mean_terminated_length": 9.25, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 115.61111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.055848684161901474, "kl": 0.01853666454553604, "learning_rate": 2.087777777777778e-06, "loss": 0.0009, "num_tokens": 1862594.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6243 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 115.62962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.028466783463954926, "kl": 0.0032887131674215198, "learning_rate": 2.0872222222222225e-06, "loss": 0.0002, "num_tokens": 1862878.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6244 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 115.64814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.019997116178274155, "kl": 0.02846953272819519, "learning_rate": 2.086666666666667e-06, "loss": 0.0014, "num_tokens": 1863094.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 6245 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.5, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 22.5, "completions/mean_terminated_length": 22.5, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 115.66666666666667, "frac_reward_zero_std": 1.0, "grad_norm": 0.036648258566856384, "kl": 0.010794499423354864, "learning_rate": 2.0861111111111112e-06, "loss": 0.0005, "num_tokens": 1863412.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6246 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 115.68518518518519, "frac_reward_zero_std": 1.0, "grad_norm": 0.06992743164300919, "kl": 0.0068070676643401384, "learning_rate": 2.0855555555555556e-06, "loss": 0.0004, "num_tokens": 1863680.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6247 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 115.70370370370371, "frac_reward_zero_std": 1.0, "grad_norm": 0.015659822151064873, "kl": 0.005870152032002807, "learning_rate": 2.085e-06, "loss": 0.0003, "num_tokens": 1863964.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6248 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.5, "completions/clipped_ratio": 0.0, "completions/max_length": 25.0, "completions/max_terminated_length": 25.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 115.72222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.153276726603508, "kl": 0.03809254802763462, "learning_rate": 2.0844444444444443e-06, "loss": 0.0024, "num_tokens": 1864278.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6249 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.75, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 14.75, "completions/mean_terminated_length": 14.75, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 115.74074074074075, "frac_reward_zero_std": 1.0, "grad_norm": 0.020477956160902977, "kl": 0.005009764921851456, "learning_rate": 2.083888888888889e-06, "loss": 0.0003, "num_tokens": 1864565.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6250 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 115.75925925925925, "frac_reward_zero_std": 1.0, "grad_norm": 0.050591498613357544, "kl": 0.004978355020284653, "learning_rate": 2.0833333333333334e-06, "loss": 0.0002, "num_tokens": 1864825.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 6251 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.25, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 15.25, "completions/mean_terminated_length": 15.25, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 115.77777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.04683661460876465, "kl": 0.2531895637512207, "learning_rate": 2.0827777777777782e-06, "loss": 0.0128, "num_tokens": 1865110.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6252 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 115.79629629629629, "frac_reward_zero_std": 1.0, "grad_norm": 0.005169231910258532, "kl": 0.010063804686069489, "learning_rate": 2.0822222222222226e-06, "loss": 0.0005, "num_tokens": 1865346.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 6253 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 38.0, "completions/clipped_ratio": 0.0, "completions/max_length": 45.0, "completions/max_terminated_length": 45.0, "completions/mean_length": 38.0, "completions/mean_terminated_length": 38.0, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 115.81481481481481, "frac_reward_zero_std": 1.0, "grad_norm": 0.10124361515045166, "kl": 0.09219424054026604, "learning_rate": 2.081666666666667e-06, "loss": 0.0046, "num_tokens": 1865766.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6254 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.25, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 20.25, "completions/mean_terminated_length": 20.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 115.83333333333333, "frac_reward_zero_std": 1.0, "grad_norm": 0.10143721848726273, "kl": 0.10244003497064114, "learning_rate": 2.0811111111111113e-06, "loss": 0.0055, "num_tokens": 1866095.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6255 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 28.0, "completions/clipped_ratio": 0.0, "completions/max_length": 30.0, "completions/max_terminated_length": 30.0, "completions/mean_length": 28.0, "completions/mean_terminated_length": 28.0, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 115.85185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.10411583632230759, "kl": 0.03475111536681652, "learning_rate": 2.0805555555555556e-06, "loss": 0.0017, "num_tokens": 1866455.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 6256 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 115.87037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 1.0585872587398626e-05, "kl": 5.967915058135986e-06, "learning_rate": 2.08e-06, "loss": 0.0, "num_tokens": 1866675.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 6257 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.25, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 11.25, "completions/mean_terminated_length": 11.25, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 115.88888888888889, "frac_reward_zero_std": 0.0, "grad_norm": 7.671314239501953, "kl": 0.07774632796645164, "learning_rate": 2.0794444444444443e-06, "loss": 0.1, "num_tokens": 1866944.0, "reward": 5.875, "reward_std": 3.25, "rewards/reward_combined/mean": 5.875, "rewards/reward_combined/std": 3.25, "step": 6258 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 115.9074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.18662318587303162, "kl": 0.045118626207113266, "learning_rate": 2.078888888888889e-06, "loss": 0.0023, "num_tokens": 1867228.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6259 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 5.25, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 5.25, "completions/mean_terminated_length": 5.25, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 115.92592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.058833252638578415, "kl": 0.0047527397982776165, "learning_rate": 2.0783333333333335e-06, "loss": 0.0003, "num_tokens": 1867449.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 6260 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 115.94444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.07003195583820343, "kl": 0.0028115957975387573, "learning_rate": 2.077777777777778e-06, "loss": 0.0001, "num_tokens": 1867653.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 6261 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 115.96296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.002623763168230653, "kl": 9.787827730178833e-05, "learning_rate": 2.0772222222222226e-06, "loss": 0.0, "num_tokens": 1867865.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 6262 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.25, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.25, "completions/mean_terminated_length": 14.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 115.98148148148148, "frac_reward_zero_std": 0.0, "grad_norm": 1.8695399761199951, "kl": 0.4132527820765972, "learning_rate": 2.076666666666667e-06, "loss": 0.1152, "num_tokens": 1868154.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 6263 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.25, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.25, "completions/mean_terminated_length": 14.25, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 116.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.08134671300649643, "kl": 0.02297285944223404, "learning_rate": 2.0761111111111113e-06, "loss": 0.0012, "num_tokens": 1868443.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6264 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.25, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 10.25, "completions/mean_terminated_length": 10.25, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 116.01851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.042093951255083084, "kl": 0.001060055335983634, "learning_rate": 2.0755555555555557e-06, "loss": 0.0001, "num_tokens": 1868700.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 6265 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.5, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 19.5, "completions/mean_terminated_length": 19.5, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 116.03703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.09759671986103058, "kl": 0.020619112066924572, "learning_rate": 2.075e-06, "loss": 0.001, "num_tokens": 1869006.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6266 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 34.5, "completions/clipped_ratio": 0.0, "completions/max_length": 44.0, "completions/max_terminated_length": 44.0, "completions/mean_length": 34.5, "completions/mean_terminated_length": 34.5, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 116.05555555555556, "frac_reward_zero_std": 0.0, "grad_norm": 2.3088579177856445, "kl": 0.2639554962515831, "learning_rate": 2.0744444444444444e-06, "loss": 0.1347, "num_tokens": 1869360.0, "reward": 5.125, "reward_std": 3.4731109142303467, "rewards/reward_combined/mean": 5.125, "rewards/reward_combined/std": 3.4731109142303467, "step": 6267 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 28.5, "completions/clipped_ratio": 0.0, "completions/max_length": 31.0, "completions/max_terminated_length": 31.0, "completions/mean_length": 28.5, "completions/mean_terminated_length": 28.5, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 116.07407407407408, "frac_reward_zero_std": 0.0, "grad_norm": 2.6709389686584473, "kl": 0.1765725314617157, "learning_rate": 2.073888888888889e-06, "loss": -0.0385, "num_tokens": 1869710.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 6268 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 116.0925925925926, "frac_reward_zero_std": 0.0, "grad_norm": 2.2044904232025146, "kl": 0.04789942689239979, "learning_rate": 2.0733333333333335e-06, "loss": 0.007, "num_tokens": 1870000.0, "reward": 5.75, "reward_std": 4.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 4.5, "step": 6269 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 116.11111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.12987981736660004, "kl": 0.03345471387729049, "learning_rate": 2.072777777777778e-06, "loss": 0.0018, "num_tokens": 1870300.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6270 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 116.12962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.0015785368159413338, "kl": 0.0017009845469146967, "learning_rate": 2.0722222222222222e-06, "loss": 0.0001, "num_tokens": 1870580.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6271 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 116.14814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.04360513389110565, "kl": 0.25811767578125, "learning_rate": 2.071666666666667e-06, "loss": 0.0129, "num_tokens": 1870878.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6272 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 116.16666666666667, "frac_reward_zero_std": 1.0, "grad_norm": 0.005032096523791552, "kl": 0.010097846388816833, "learning_rate": 2.0711111111111114e-06, "loss": 0.0005, "num_tokens": 1871114.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 6273 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 116.18518518518519, "frac_reward_zero_std": 1.0, "grad_norm": 0.022258495911955833, "kl": 0.01305572222918272, "learning_rate": 2.0705555555555557e-06, "loss": 0.0007, "num_tokens": 1871374.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6274 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 116.20370370370371, "frac_reward_zero_std": 1.0, "grad_norm": 0.028771953657269478, "kl": 0.0036584995687007904, "learning_rate": 2.07e-06, "loss": 0.0002, "num_tokens": 1871634.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 6275 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.25, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 18.25, "completions/mean_terminated_length": 18.25, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 116.22222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.05105195939540863, "kl": 0.0576926376670599, "learning_rate": 2.0694444444444444e-06, "loss": 0.0029, "num_tokens": 1871927.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 6276 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 116.24074074074075, "frac_reward_zero_std": 1.0, "grad_norm": 0.2666839063167572, "kl": 0.06713864207267761, "learning_rate": 2.0688888888888892e-06, "loss": 0.0034, "num_tokens": 1872199.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6277 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 35.25, "completions/clipped_ratio": 0.0, "completions/max_length": 45.0, "completions/max_terminated_length": 45.0, "completions/mean_length": 35.25, "completions/mean_terminated_length": 35.25, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 116.25925925925925, "frac_reward_zero_std": 1.0, "grad_norm": 0.17966657876968384, "kl": 0.1009577065706253, "learning_rate": 2.0683333333333336e-06, "loss": 0.005, "num_tokens": 1872608.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6278 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.25, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.25, "completions/mean_terminated_length": 11.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 116.27777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.026716604828834534, "kl": 0.009815847035497427, "learning_rate": 2.067777777777778e-06, "loss": 0.0005, "num_tokens": 1872881.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6279 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 5.25, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 5.25, "completions/mean_terminated_length": 5.25, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 116.29629629629629, "frac_reward_zero_std": 1.0, "grad_norm": 0.0686362087726593, "kl": 0.004489358747377992, "learning_rate": 2.0672222222222223e-06, "loss": 0.0002, "num_tokens": 1873102.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 6280 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.25, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 19.25, "completions/mean_terminated_length": 19.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 116.31481481481481, "frac_reward_zero_std": 0.0, "grad_norm": 2.742727518081665, "kl": 0.17118310183286667, "learning_rate": 2.0666666666666666e-06, "loss": 0.0927, "num_tokens": 1873419.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 6281 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 43.5, "completions/clipped_ratio": 0.0, "completions/max_length": 51.0, "completions/max_terminated_length": 51.0, "completions/mean_length": 43.5, "completions/mean_terminated_length": 43.5, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 116.33333333333333, "frac_reward_zero_std": 1.0, "grad_norm": 2.832313060760498, "kl": 0.33602166175842285, "learning_rate": 2.0661111111111114e-06, "loss": 0.0188, "num_tokens": 1873813.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6282 }, { "clip_ratio/high_max": 0.013513513840734959, "clip_ratio/high_mean": 0.013513513840734959, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.013513513840734959, "completion_length": 18.75, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 18.75, "completions/mean_terminated_length": 18.75, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 116.35185185185185, "frac_reward_zero_std": 0.0, "grad_norm": 3.9136745929718018, "kl": 0.08286305330693722, "learning_rate": 2.0655555555555558e-06, "loss": 0.0063, "num_tokens": 1874108.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 6283 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 47.0, "completions/clipped_ratio": 0.0, "completions/max_length": 51.0, "completions/max_terminated_length": 51.0, "completions/mean_length": 47.0, "completions/mean_terminated_length": 47.0, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 116.37037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.015258959494531155, "kl": 0.04507043771445751, "learning_rate": 2.065e-06, "loss": 0.0022, "num_tokens": 1874576.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 6284 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 116.38888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.03533647954463959, "kl": 0.27490437030792236, "learning_rate": 2.064444444444445e-06, "loss": 0.0137, "num_tokens": 1874864.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6285 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 116.4074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.07099699974060059, "kl": 0.014729758258908987, "learning_rate": 2.0638888888888893e-06, "loss": 0.0007, "num_tokens": 1875160.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6286 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.0, "completions/clipped_ratio": 0.0, "completions/max_length": 30.0, "completions/max_terminated_length": 30.0, "completions/mean_length": 25.0, "completions/mean_terminated_length": 25.0, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 116.42592592592592, "frac_reward_zero_std": 0.0, "grad_norm": 3.1758902072906494, "kl": 0.05702688731253147, "learning_rate": 2.0633333333333336e-06, "loss": -0.0365, "num_tokens": 1875508.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 6287 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 116.44444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.007042685989290476, "kl": 0.00039422884583473206, "learning_rate": 2.062777777777778e-06, "loss": 0.0, "num_tokens": 1875752.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 6288 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 33.25, "completions/clipped_ratio": 0.0, "completions/max_length": 38.0, "completions/max_terminated_length": 38.0, "completions/mean_length": 33.25, "completions/mean_terminated_length": 33.25, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 116.46296296296296, "frac_reward_zero_std": 0.0, "grad_norm": 1.4976694583892822, "kl": 0.025913220830261707, "learning_rate": 2.0622222222222223e-06, "loss": 0.0511, "num_tokens": 1876109.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 6289 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.0, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 22.0, "completions/mean_terminated_length": 22.0, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 116.48148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.18947839736938477, "kl": 0.03179089445620775, "learning_rate": 2.0616666666666667e-06, "loss": 0.0016, "num_tokens": 1876441.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6290 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.25, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.25, "completions/mean_terminated_length": 15.25, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 116.5, "frac_reward_zero_std": 1.0, "grad_norm": 0.02887842431664467, "kl": 0.0031678073573857546, "learning_rate": 2.061111111111111e-06, "loss": 0.0002, "num_tokens": 1876762.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6291 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 35.0, "completions/clipped_ratio": 0.0, "completions/max_length": 38.0, "completions/max_terminated_length": 38.0, "completions/mean_length": 35.0, "completions/mean_terminated_length": 35.0, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 116.51851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.2821767032146454, "kl": 0.08369030430912971, "learning_rate": 2.060555555555556e-06, "loss": 0.0042, "num_tokens": 1877130.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6292 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 116.53703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.08565755188465118, "kl": 0.02049973257817328, "learning_rate": 2.06e-06, "loss": 0.0011, "num_tokens": 1877400.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6293 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.5, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 8.5, "completions/mean_terminated_length": 8.5, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 116.55555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.04043430835008621, "kl": 0.003047777689062059, "learning_rate": 2.0594444444444445e-06, "loss": 0.0001, "num_tokens": 1877654.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 6294 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 116.57407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.007375488523393869, "kl": 0.0010565519332885742, "learning_rate": 2.0588888888888893e-06, "loss": 0.0001, "num_tokens": 1877866.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 6295 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 116.5925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.03139173239469528, "kl": 0.0017637120909057558, "learning_rate": 2.0583333333333337e-06, "loss": 0.0001, "num_tokens": 1878154.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6296 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.25, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 13.25, "completions/mean_terminated_length": 13.25, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 116.61111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.38075655698776245, "kl": 0.07531918212771416, "learning_rate": 2.057777777777778e-06, "loss": 0.0042, "num_tokens": 1878427.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6297 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 88.0, "completions/clipped_ratio": 0.25, "completions/max_length": 256.0, "completions/max_terminated_length": 34.0, "completions/mean_length": 88.0, "completions/mean_terminated_length": 32.0, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 116.62962962962963, "frac_reward_zero_std": 0.0, "grad_norm": 1.40717351436615, "kl": 0.23050304129719734, "learning_rate": 2.0572222222222224e-06, "loss": 0.4811, "num_tokens": 1879003.0, "reward": 2.049999952316284, "reward_std": 4.180510520935059, "rewards/reward_combined/mean": 2.049999952316284, "rewards/reward_combined/std": 4.1805100440979, "step": 6298 }, { "clip_ratio/high_max": 0.011111111380159855, "clip_ratio/high_mean": 0.011111111380159855, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.011111111380159855, "completion_length": 20.25, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 20.25, "completions/mean_terminated_length": 20.25, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 116.64814814814815, "frac_reward_zero_std": 0.0, "grad_norm": 4.3738017082214355, "kl": 0.06858956813812256, "learning_rate": 2.0566666666666667e-06, "loss": -0.0738, "num_tokens": 1879312.0, "reward": 7.875, "reward_std": 0.25, "rewards/reward_combined/mean": 7.875, "rewards/reward_combined/std": 0.25, "step": 6299 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 34.25, "completions/clipped_ratio": 0.0, "completions/max_length": 41.0, "completions/max_terminated_length": 41.0, "completions/mean_length": 34.25, "completions/mean_terminated_length": 34.25, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 116.66666666666667, "frac_reward_zero_std": 0.0, "grad_norm": 1.4445629119873047, "kl": 0.08344636857509613, "learning_rate": 2.056111111111111e-06, "loss": -0.1153, "num_tokens": 1879701.0, "reward": 7.875, "reward_std": 0.25, "rewards/reward_combined/mean": 7.875, "rewards/reward_combined/std": 0.25, "step": 6300 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.5, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 9.5, "completions/mean_terminated_length": 9.5, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 116.68518518518519, "frac_reward_zero_std": 1.0, "grad_norm": 0.026725752279162407, "kl": 0.0020691233221441507, "learning_rate": 2.0555555555555555e-06, "loss": 0.0001, "num_tokens": 1879963.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6301 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.5, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 3.5, "completions/mean_terminated_length": 3.5, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 116.70370370370371, "frac_reward_zero_std": 1.0, "grad_norm": 0.02836594730615616, "kl": 0.002704039216041565, "learning_rate": 2.0550000000000002e-06, "loss": 0.0001, "num_tokens": 1880173.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 6302 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.75, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 20.75, "completions/mean_terminated_length": 20.75, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 116.72222222222223, "frac_reward_zero_std": 0.0, "grad_norm": 4.0364298820495605, "kl": 1.1924767270684242, "learning_rate": 2.0544444444444446e-06, "loss": 0.0866, "num_tokens": 1880508.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 6303 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 116.74074074074075, "frac_reward_zero_std": 1.0, "grad_norm": 0.0058085257187485695, "kl": 0.0004229918122291565, "learning_rate": 2.0538888888888894e-06, "loss": 0.0, "num_tokens": 1880720.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 6304 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.75, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 13.75, "completions/mean_terminated_length": 13.75, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 116.75925925925925, "frac_reward_zero_std": 1.0, "grad_norm": 0.06621057540178299, "kl": 0.01415944891050458, "learning_rate": 2.0533333333333337e-06, "loss": 0.0007, "num_tokens": 1881007.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6305 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 116.77777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 1.0208081221207976e-05, "kl": 6.027519702911377e-06, "learning_rate": 2.052777777777778e-06, "loss": 0.0, "num_tokens": 1881227.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 6306 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.75, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 13.75, "completions/mean_terminated_length": 13.75, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 116.79629629629629, "frac_reward_zero_std": 1.0, "grad_norm": 0.11967355012893677, "kl": 0.017018096521496773, "learning_rate": 2.0522222222222224e-06, "loss": 0.0007, "num_tokens": 1881494.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6307 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.75, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 19.75, "completions/mean_terminated_length": 19.75, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 116.81481481481481, "frac_reward_zero_std": 1.0, "grad_norm": 0.07742547243833542, "kl": 0.008108192356303334, "learning_rate": 2.0516666666666668e-06, "loss": 0.0004, "num_tokens": 1881813.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6308 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.25, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 19.25, "completions/mean_terminated_length": 19.25, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 116.83333333333333, "frac_reward_zero_std": 1.0, "grad_norm": 0.0527748242020607, "kl": 0.22460339218378067, "learning_rate": 2.051111111111111e-06, "loss": 0.0117, "num_tokens": 1882138.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6309 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 10.5, "completions/mean_terminated_length": 10.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 116.85185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.19082850217819214, "kl": 0.03418544679880142, "learning_rate": 2.0505555555555555e-06, "loss": 0.0017, "num_tokens": 1882404.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6310 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.75, "completions/clipped_ratio": 0.0, "completions/max_length": 31.0, "completions/max_terminated_length": 31.0, "completions/mean_length": 19.75, "completions/mean_terminated_length": 19.75, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 116.87037037037037, "frac_reward_zero_std": 0.0, "grad_norm": 4.181663990020752, "kl": 0.15252315998077393, "learning_rate": 2.05e-06, "loss": 0.1622, "num_tokens": 1882703.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 6311 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 116.88888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.012199715711176395, "kl": 0.001241397112607956, "learning_rate": 2.0494444444444446e-06, "loss": 0.0001, "num_tokens": 1882959.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6312 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.75, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 14.75, "completions/mean_terminated_length": 14.75, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 116.9074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.026064833626151085, "kl": 0.013996317517012358, "learning_rate": 2.048888888888889e-06, "loss": 0.0007, "num_tokens": 1883242.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6313 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 116.92592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.020607152953743935, "kl": 0.009501350112259388, "learning_rate": 2.0483333333333338e-06, "loss": 0.0005, "num_tokens": 1883554.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 6314 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.75, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 9.75, "completions/mean_terminated_length": 9.75, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 116.94444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.08374211192131042, "kl": 0.008211401145672426, "learning_rate": 2.047777777777778e-06, "loss": 0.0006, "num_tokens": 1883801.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 6315 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 12.5, "completions/mean_terminated_length": 12.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 116.96296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.13986767828464508, "kl": 0.020239267498254776, "learning_rate": 2.0472222222222225e-06, "loss": 0.001, "num_tokens": 1884083.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6316 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.0, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 23.0, "completions/mean_terminated_length": 23.0, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 116.98148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.03415832668542862, "kl": 0.007188936695456505, "learning_rate": 2.046666666666667e-06, "loss": 0.0004, "num_tokens": 1884403.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6317 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 7.5, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 7.5, "completions/mean_terminated_length": 7.5, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 117.0, "frac_reward_zero_std": 0.0, "grad_norm": 4.232249736785889, "kl": 0.05148887634277344, "learning_rate": 2.046111111111111e-06, "loss": 0.3195, "num_tokens": 1884633.0, "reward": 3.875, "reward_std": 0.25, "rewards/reward_combined/mean": 3.875, "rewards/reward_combined/std": 0.25, "step": 6318 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 117.01851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.01932649314403534, "kl": 0.0019014626741409302, "learning_rate": 2.0455555555555555e-06, "loss": 0.0001, "num_tokens": 1884893.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6319 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.25, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.25, "completions/mean_terminated_length": 14.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 117.03703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.06785885244607925, "kl": 0.03098086966201663, "learning_rate": 2.045e-06, "loss": 0.0015, "num_tokens": 1885182.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6320 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.75, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 18.75, "completions/mean_terminated_length": 18.75, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 117.05555555555556, "frac_reward_zero_std": 0.0, "grad_norm": 2.191436767578125, "kl": 0.10969037190079689, "learning_rate": 2.0444444444444447e-06, "loss": -0.0052, "num_tokens": 1885477.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 6321 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 117.07407407407408, "frac_reward_zero_std": 0.0, "grad_norm": 2.3031864166259766, "kl": 0.29227305541280657, "learning_rate": 2.043888888888889e-06, "loss": 0.0188, "num_tokens": 1885777.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 6322 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 33.75, "completions/clipped_ratio": 0.0, "completions/max_length": 34.0, "completions/max_terminated_length": 34.0, "completions/mean_length": 33.75, "completions/mean_terminated_length": 33.75, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 117.0925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.06559586524963379, "kl": 0.06970850005745888, "learning_rate": 2.0433333333333334e-06, "loss": 0.0035, "num_tokens": 1886140.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6323 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.25, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 6.25, "completions/mean_terminated_length": 6.25, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 117.11111111111111, "frac_reward_zero_std": 0.0, "grad_norm": 8.963214874267578, "kl": 0.05980444699525833, "learning_rate": 2.042777777777778e-06, "loss": 0.168, "num_tokens": 1886365.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 6324 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 117.12962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.026872307062149048, "kl": 0.0028980448842048645, "learning_rate": 2.0422222222222225e-06, "loss": 0.0001, "num_tokens": 1886573.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 6325 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 117.14814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.00760700972750783, "kl": 0.001241639256477356, "learning_rate": 2.041666666666667e-06, "loss": 0.0001, "num_tokens": 1886785.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 6326 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.75, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 4.75, "completions/mean_terminated_length": 4.75, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 117.16666666666667, "frac_reward_zero_std": 1.0, "grad_norm": 0.580436110496521, "kl": 0.07058430463075638, "learning_rate": 2.0411111111111112e-06, "loss": 0.0041, "num_tokens": 1887004.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 6327 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 117.18518518518519, "frac_reward_zero_std": 1.0, "grad_norm": 0.050274718552827835, "kl": 0.014770099893212318, "learning_rate": 2.0405555555555556e-06, "loss": 0.0007, "num_tokens": 1887282.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6328 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 117.20370370370371, "frac_reward_zero_std": 1.0, "grad_norm": 0.045702435076236725, "kl": 0.004559360444545746, "learning_rate": 2.04e-06, "loss": 0.0002, "num_tokens": 1887542.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 6329 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 117.22222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.004470529034733772, "kl": 0.01025998592376709, "learning_rate": 2.0394444444444447e-06, "loss": 0.0005, "num_tokens": 1887778.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 6330 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 117.24074074074075, "frac_reward_zero_std": 1.0, "grad_norm": 0.09242578595876694, "kl": 0.01036973058944568, "learning_rate": 2.038888888888889e-06, "loss": 0.0007, "num_tokens": 1888034.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 6331 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 117.25925925925925, "frac_reward_zero_std": 1.0, "grad_norm": 0.054000452160835266, "kl": 0.24508358538150787, "learning_rate": 2.0383333333333334e-06, "loss": 0.0123, "num_tokens": 1888334.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6332 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.75, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 15.75, "completions/mean_terminated_length": 15.75, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 117.27777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.04483787342905998, "kl": 0.009859418496489525, "learning_rate": 2.037777777777778e-06, "loss": 0.0005, "num_tokens": 1888625.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6333 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 29.75, "completions/clipped_ratio": 0.0, "completions/max_length": 32.0, "completions/max_terminated_length": 32.0, "completions/mean_length": 29.75, "completions/mean_terminated_length": 29.75, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 117.29629629629629, "frac_reward_zero_std": 0.0, "grad_norm": 1.503757357597351, "kl": 0.15551457554101944, "learning_rate": 2.0372222222222226e-06, "loss": 0.0155, "num_tokens": 1888980.0, "reward": 6.5, "reward_std": 2.0, "rewards/reward_combined/mean": 6.5, "rewards/reward_combined/std": 2.0, "step": 6334 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 117.31481481481481, "frac_reward_zero_std": 1.0, "grad_norm": 2.0908022634102963e-05, "kl": 6.116926670074463e-06, "learning_rate": 2.036666666666667e-06, "loss": 0.0, "num_tokens": 1889200.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 6335 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 26.75, "completions/clipped_ratio": 0.0, "completions/max_length": 28.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 26.75, "completions/mean_terminated_length": 26.75, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 117.33333333333333, "frac_reward_zero_std": 0.0, "grad_norm": 1.3651841878890991, "kl": 0.25582121312618256, "learning_rate": 2.0361111111111113e-06, "loss": -0.0248, "num_tokens": 1889531.0, "reward": 7.875, "reward_std": 0.25, "rewards/reward_combined/mean": 7.875, "rewards/reward_combined/std": 0.25, "step": 6336 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 117.35185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.09336521476507187, "kl": 0.02954252064228058, "learning_rate": 2.0355555555555556e-06, "loss": 0.0015, "num_tokens": 1889823.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6337 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.75, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 15.75, "completions/mean_terminated_length": 15.75, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 117.37037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.08012770861387253, "kl": 0.013074666727334261, "learning_rate": 2.035e-06, "loss": 0.0007, "num_tokens": 1890126.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6338 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 10.5, "completions/mean_terminated_length": 10.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 117.38888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.009076678194105625, "kl": 0.000856570404721424, "learning_rate": 2.0344444444444448e-06, "loss": 0.0, "num_tokens": 1890388.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6339 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.25, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 18.25, "completions/mean_terminated_length": 18.25, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 117.4074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.3684970438480377, "kl": 0.1749100163578987, "learning_rate": 2.033888888888889e-06, "loss": 0.0088, "num_tokens": 1890681.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6340 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.5, "completions/clipped_ratio": 0.0, "completions/max_length": 27.0, "completions/max_terminated_length": 27.0, "completions/mean_length": 21.5, "completions/mean_terminated_length": 21.5, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 117.42592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.1019514948129654, "kl": 0.03167406329885125, "learning_rate": 2.0333333333333335e-06, "loss": 0.0015, "num_tokens": 1891015.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 6341 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.75, "completions/clipped_ratio": 0.0, "completions/max_length": 27.0, "completions/max_terminated_length": 27.0, "completions/mean_length": 21.75, "completions/mean_terminated_length": 21.75, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 117.44444444444444, "frac_reward_zero_std": 0.0, "grad_norm": 2.882183313369751, "kl": 0.08877290785312653, "learning_rate": 2.032777777777778e-06, "loss": -0.1063, "num_tokens": 1891322.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 6342 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 29.75, "completions/clipped_ratio": 0.0, "completions/max_length": 35.0, "completions/max_terminated_length": 35.0, "completions/mean_length": 29.75, "completions/mean_terminated_length": 29.75, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 117.46296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.18320921063423157, "kl": 0.07062910031527281, "learning_rate": 2.032222222222222e-06, "loss": 0.0038, "num_tokens": 1891665.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6343 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.25, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 10.25, "completions/mean_terminated_length": 10.25, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 117.48148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.028293726965785027, "kl": 0.0007381189789157361, "learning_rate": 2.031666666666667e-06, "loss": 0.0, "num_tokens": 1891922.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 6344 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.25, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 9.25, "completions/mean_terminated_length": 9.25, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 117.5, "frac_reward_zero_std": 1.0, "grad_norm": 0.0569862499833107, "kl": 0.019052807241678238, "learning_rate": 2.0311111111111113e-06, "loss": 0.001, "num_tokens": 1892183.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6345 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.25, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.25, "completions/mean_terminated_length": 14.25, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 117.51851851851852, "frac_reward_zero_std": 0.0, "grad_norm": 1.6683363914489746, "kl": 0.0125611950061284, "learning_rate": 2.0305555555555557e-06, "loss": 0.0177, "num_tokens": 1892472.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 6346 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.25, "completions/clipped_ratio": 0.0, "completions/max_length": 30.0, "completions/max_terminated_length": 30.0, "completions/mean_length": 24.25, "completions/mean_terminated_length": 24.25, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 117.53703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.05030571296811104, "kl": 0.012750467285513878, "learning_rate": 2.0300000000000005e-06, "loss": 0.0007, "num_tokens": 1892797.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6347 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 117.55555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.003586031496524811, "kl": 0.0018219053745269775, "learning_rate": 2.029444444444445e-06, "loss": 0.0001, "num_tokens": 1893077.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6348 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 117.57407407407408, "frac_reward_zero_std": 0.0, "grad_norm": 5.392856121063232, "kl": 0.053120071068406105, "learning_rate": 2.028888888888889e-06, "loss": -0.0901, "num_tokens": 1893357.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 6349 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.5, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 17.5, "completions/mean_terminated_length": 17.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 117.5925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.16689318418502808, "kl": 0.1875579133629799, "learning_rate": 2.0283333333333335e-06, "loss": 0.0094, "num_tokens": 1893667.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6350 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 117.61111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.260375440120697, "kl": 0.033698545768857, "learning_rate": 2.027777777777778e-06, "loss": 0.0017, "num_tokens": 1893935.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6351 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 117.62962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.06738339364528656, "kl": 0.047733768820762634, "learning_rate": 2.0272222222222222e-06, "loss": 0.0027, "num_tokens": 1894211.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6352 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 45.0, "completions/clipped_ratio": 0.0, "completions/max_length": 45.0, "completions/max_terminated_length": 45.0, "completions/mean_length": 45.0, "completions/mean_terminated_length": 45.0, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "epoch": 117.64814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.01668206788599491, "kl": 0.06823649257421494, "learning_rate": 2.0266666666666666e-06, "loss": 0.0034, "num_tokens": 1894659.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6353 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 117.66666666666667, "frac_reward_zero_std": 1.0, "grad_norm": 0.012142961844801903, "kl": 0.0012504607730079442, "learning_rate": 2.0261111111111114e-06, "loss": 0.0001, "num_tokens": 1894919.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 6354 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 51.0, "completions/clipped_ratio": 0.0, "completions/max_length": 59.0, "completions/max_terminated_length": 59.0, "completions/mean_length": 51.0, "completions/mean_terminated_length": 51.0, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 117.68518518518519, "frac_reward_zero_std": 0.0, "grad_norm": 1.3882849216461182, "kl": 0.0591182466596365, "learning_rate": 2.0255555555555557e-06, "loss": 0.0028, "num_tokens": 1895403.0, "reward": 2.25, "reward_std": 1.5, "rewards/reward_combined/mean": 2.25, "rewards/reward_combined/std": 1.5, "step": 6355 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.5, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 21.5, "completions/mean_terminated_length": 21.5, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 117.70370370370371, "frac_reward_zero_std": 1.0, "grad_norm": 0.04387183114886284, "kl": 0.015235158614814281, "learning_rate": 2.025e-06, "loss": 0.0008, "num_tokens": 1895741.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6356 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.0, "completions/clipped_ratio": 0.0, "completions/max_length": 25.0, "completions/max_terminated_length": 25.0, "completions/mean_length": 22.0, "completions/mean_terminated_length": 22.0, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 117.72222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.060757119208574295, "kl": 0.017239089123904705, "learning_rate": 2.024444444444445e-06, "loss": 0.0009, "num_tokens": 1896073.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6357 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 117.74074074074075, "frac_reward_zero_std": 0.0, "grad_norm": 2.1733758449554443, "kl": 0.45842787623405457, "learning_rate": 2.0238888888888892e-06, "loss": -0.0545, "num_tokens": 1896357.0, "reward": 7.875, "reward_std": 0.25, "rewards/reward_combined/mean": 7.875, "rewards/reward_combined/std": 0.25, "step": 6358 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.25, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 4.25, "completions/mean_terminated_length": 4.25, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 117.75925925925925, "frac_reward_zero_std": 1.0, "grad_norm": 0.02326754666864872, "kl": 0.000525517767528072, "learning_rate": 2.0233333333333336e-06, "loss": 0.0, "num_tokens": 1896570.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 6359 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.0, "completions/clipped_ratio": 0.0, "completions/max_length": 31.0, "completions/max_terminated_length": 31.0, "completions/mean_length": 22.0, "completions/mean_terminated_length": 22.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 117.77777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.05020660534501076, "kl": 0.06313306652009487, "learning_rate": 2.022777777777778e-06, "loss": 0.0029, "num_tokens": 1896886.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6360 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 117.79629629629629, "frac_reward_zero_std": 1.0, "grad_norm": 0.04792928695678711, "kl": 0.010669475421309471, "learning_rate": 2.0222222222222223e-06, "loss": 0.0005, "num_tokens": 1897158.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6361 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.25, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 12.25, "completions/mean_terminated_length": 12.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 117.81481481481481, "frac_reward_zero_std": 1.0, "grad_norm": 0.12444091588258743, "kl": 0.01380720897577703, "learning_rate": 2.0216666666666667e-06, "loss": 0.0007, "num_tokens": 1897419.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6362 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.75, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.75, "completions/mean_terminated_length": 6.75, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 117.83333333333333, "frac_reward_zero_std": 1.0, "grad_norm": 0.011053243651986122, "kl": 0.0015502138412557542, "learning_rate": 2.021111111111111e-06, "loss": 0.0001, "num_tokens": 1897654.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 6363 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 45.75, "completions/clipped_ratio": 0.0, "completions/max_length": 53.0, "completions/max_terminated_length": 53.0, "completions/mean_length": 45.75, "completions/mean_terminated_length": 45.75, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 117.85185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.2209734469652176, "kl": 0.06316827982664108, "learning_rate": 2.0205555555555558e-06, "loss": 0.0032, "num_tokens": 1898057.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6364 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.0, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 19.0, "completions/mean_terminated_length": 19.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 117.87037037037037, "frac_reward_zero_std": 0.0, "grad_norm": 2.0343759059906006, "kl": 0.02842277637682855, "learning_rate": 2.02e-06, "loss": 0.058, "num_tokens": 1898373.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 6365 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 12.5, "completions/mean_terminated_length": 12.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 117.88888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.05238278955221176, "kl": 0.00852542370557785, "learning_rate": 2.0194444444444445e-06, "loss": 0.0004, "num_tokens": 1898655.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6366 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.25, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 19.25, "completions/mean_terminated_length": 19.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 117.9074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.02787281572818756, "kl": 0.02005875576287508, "learning_rate": 2.0188888888888893e-06, "loss": 0.001, "num_tokens": 1898980.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6367 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 117.92592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.019578250125050545, "kl": 0.0022009711246937513, "learning_rate": 2.0183333333333336e-06, "loss": 0.0001, "num_tokens": 1899298.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6368 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 34.25, "completions/clipped_ratio": 0.0, "completions/max_length": 64.0, "completions/max_terminated_length": 64.0, "completions/mean_length": 34.25, "completions/mean_terminated_length": 34.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 117.94444444444444, "frac_reward_zero_std": 0.0, "grad_norm": 2.842864513397217, "kl": 0.07865768298506737, "learning_rate": 2.017777777777778e-06, "loss": 0.2538, "num_tokens": 1899687.0, "reward": 6.0, "reward_std": 3.34165620803833, "rewards/reward_combined/mean": 6.0, "rewards/reward_combined/std": 3.34165620803833, "step": 6369 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.25, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 14.25, "completions/mean_terminated_length": 14.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 117.96296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.06308960914611816, "kl": 0.019311968237161636, "learning_rate": 2.0172222222222223e-06, "loss": 0.0009, "num_tokens": 1899968.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6370 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 117.98148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.020725080743432045, "kl": 0.009757064282894135, "learning_rate": 2.0166666666666667e-06, "loss": 0.0005, "num_tokens": 1900280.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 6371 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 26.0, "completions/clipped_ratio": 0.0, "completions/max_length": 29.0, "completions/max_terminated_length": 29.0, "completions/mean_length": 26.0, "completions/mean_terminated_length": 26.0, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 118.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.08902864903211594, "kl": 0.1410190425813198, "learning_rate": 2.016111111111111e-06, "loss": 0.0071, "num_tokens": 1900600.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6372 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 118.01851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.005082110408693552, "kl": 0.0002959519624710083, "learning_rate": 2.0155555555555554e-06, "loss": 0.0, "num_tokens": 1900812.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 6373 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 118.03703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.009999692440032959, "kl": 0.0008309930562973022, "learning_rate": 2.015e-06, "loss": 0.0, "num_tokens": 1901100.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6374 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.25, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.25, "completions/mean_terminated_length": 11.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 118.05555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.09640976786613464, "kl": 0.055912728887051344, "learning_rate": 2.0144444444444445e-06, "loss": 0.0028, "num_tokens": 1901373.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6375 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.0, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 19.0, "completions/mean_terminated_length": 19.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 118.07407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.06420952081680298, "kl": 0.02365659549832344, "learning_rate": 2.0138888888888893e-06, "loss": 0.0012, "num_tokens": 1901669.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6376 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 29.0, "completions/clipped_ratio": 0.0, "completions/max_length": 43.0, "completions/max_terminated_length": 43.0, "completions/mean_length": 29.0, "completions/mean_terminated_length": 29.0, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 118.0925925925926, "frac_reward_zero_std": 0.0, "grad_norm": 4.3243865966796875, "kl": 0.09394191950559616, "learning_rate": 2.0133333333333337e-06, "loss": 0.186, "num_tokens": 1902037.0, "reward": 6.0, "reward_std": 3.34165620803833, "rewards/reward_combined/mean": 6.0, "rewards/reward_combined/std": 3.34165620803833, "step": 6377 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 118.11111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.03543952852487564, "kl": 0.006136229378171265, "learning_rate": 2.012777777777778e-06, "loss": 0.0003, "num_tokens": 1902307.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6378 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 118.12962962962963, "frac_reward_zero_std": 0.0, "grad_norm": 5.989591598510742, "kl": 0.0733509985730052, "learning_rate": 2.0122222222222224e-06, "loss": 0.0476, "num_tokens": 1902605.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 6379 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 26.0, "completions/clipped_ratio": 0.0, "completions/max_length": 30.0, "completions/max_terminated_length": 30.0, "completions/mean_length": 26.0, "completions/mean_terminated_length": 26.0, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 118.14814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.06498600542545319, "kl": 0.06290274299681187, "learning_rate": 2.0116666666666667e-06, "loss": 0.0031, "num_tokens": 1902937.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6380 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 118.16666666666667, "frac_reward_zero_std": 1.0, "grad_norm": 0.5110940933227539, "kl": 0.0552606723504141, "learning_rate": 2.011111111111111e-06, "loss": 0.0028, "num_tokens": 1903197.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 6381 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 10.5, "completions/mean_terminated_length": 10.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 118.18518518518519, "frac_reward_zero_std": 1.0, "grad_norm": 0.03292812407016754, "kl": 0.002601589832920581, "learning_rate": 2.0105555555555555e-06, "loss": 0.0001, "num_tokens": 1903459.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6382 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.5, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 20.5, "completions/mean_terminated_length": 20.5, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 118.20370370370371, "frac_reward_zero_std": 1.0, "grad_norm": 0.053160227835178375, "kl": 0.026907450519502163, "learning_rate": 2.0100000000000002e-06, "loss": 0.0013, "num_tokens": 1903793.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6383 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.0, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 21.0, "completions/mean_terminated_length": 21.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 118.22222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.03671453520655632, "kl": 0.2030920023098588, "learning_rate": 2.0094444444444446e-06, "loss": 0.0092, "num_tokens": 1904125.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6384 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 118.24074074074075, "frac_reward_zero_std": 1.0, "grad_norm": 0.013928708620369434, "kl": 0.0017872151220217347, "learning_rate": 2.008888888888889e-06, "loss": 0.0001, "num_tokens": 1904389.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6385 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 30.25, "completions/clipped_ratio": 0.0, "completions/max_length": 40.0, "completions/max_terminated_length": 40.0, "completions/mean_length": 30.25, "completions/mean_terminated_length": 30.25, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 118.25925925925925, "frac_reward_zero_std": 0.0, "grad_norm": 1.9947103261947632, "kl": 0.1632765829563141, "learning_rate": 2.0083333333333337e-06, "loss": -0.1175, "num_tokens": 1904746.0, "reward": 7.625, "reward_std": 0.25, "rewards/reward_combined/mean": 7.625, "rewards/reward_combined/std": 0.25, "step": 6386 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 33.0, "completions/clipped_ratio": 0.0, "completions/max_length": 41.0, "completions/max_terminated_length": 41.0, "completions/mean_length": 33.0, "completions/mean_terminated_length": 33.0, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 118.27777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.07967539876699448, "kl": 0.04094206169247627, "learning_rate": 2.007777777777778e-06, "loss": 0.0021, "num_tokens": 1905102.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6387 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 28.25, "completions/clipped_ratio": 0.0, "completions/max_length": 40.0, "completions/max_terminated_length": 40.0, "completions/mean_length": 28.25, "completions/mean_terminated_length": 28.25, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 118.29629629629629, "frac_reward_zero_std": 0.0, "grad_norm": 6.3947858810424805, "kl": 0.04415374994277954, "learning_rate": 2.0072222222222224e-06, "loss": 0.2742, "num_tokens": 1905435.0, "reward": 6.5, "reward_std": 2.0, "rewards/reward_combined/mean": 6.5, "rewards/reward_combined/std": 2.0, "step": 6388 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 118.31481481481481, "frac_reward_zero_std": 1.0, "grad_norm": 0.1404670774936676, "kl": 0.0453020054847002, "learning_rate": 2.006666666666667e-06, "loss": 0.0024, "num_tokens": 1905707.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6389 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.75, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.75, "completions/mean_terminated_length": 14.75, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 118.33333333333333, "frac_reward_zero_std": 1.0, "grad_norm": 0.04152649641036987, "kl": 0.03035030886530876, "learning_rate": 2.006111111111111e-06, "loss": 0.0016, "num_tokens": 1905998.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6390 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.5, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 21.5, "completions/mean_terminated_length": 21.5, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 118.35185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.03203443065285683, "kl": 0.014290764462202787, "learning_rate": 2.0055555555555555e-06, "loss": 0.0007, "num_tokens": 1906328.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6391 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.25, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 9.25, "completions/mean_terminated_length": 9.25, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 118.37037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.03311743959784508, "kl": 0.013506029732525349, "learning_rate": 2.0050000000000003e-06, "loss": 0.0007, "num_tokens": 1906589.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6392 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.0, "completions/max_length": 36.0, "completions/max_terminated_length": 36.0, "completions/mean_length": 32.0, "completions/mean_terminated_length": 32.0, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 118.38888888888889, "frac_reward_zero_std": 0.0, "grad_norm": 1.6702953577041626, "kl": 0.15641546994447708, "learning_rate": 2.0044444444444446e-06, "loss": -0.0624, "num_tokens": 1906945.0, "reward": 7.875, "reward_std": 0.25, "rewards/reward_combined/mean": 7.875, "rewards/reward_combined/std": 0.25, "step": 6393 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.5, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.5, "completions/mean_terminated_length": 6.5, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 118.4074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.04247508943080902, "kl": 0.00272684870287776, "learning_rate": 2.003888888888889e-06, "loss": 0.0001, "num_tokens": 1907179.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 6394 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.25, "completions/clipped_ratio": 0.0, "completions/max_length": 38.0, "completions/max_terminated_length": 38.0, "completions/mean_length": 22.25, "completions/mean_terminated_length": 22.25, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 118.42592592592592, "frac_reward_zero_std": 0.0, "grad_norm": 2.6384313106536865, "kl": 0.17362366616725922, "learning_rate": 2.0033333333333334e-06, "loss": 0.1785, "num_tokens": 1907488.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 6395 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 28.75, "completions/clipped_ratio": 0.0, "completions/max_length": 30.0, "completions/max_terminated_length": 30.0, "completions/mean_length": 28.75, "completions/mean_terminated_length": 28.75, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 118.44444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.1509215533733368, "kl": 0.18240522593259811, "learning_rate": 2.002777777777778e-06, "loss": 0.0092, "num_tokens": 1907819.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6396 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 18.0, "completions/mean_terminated_length": 18.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 118.46296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.22809985280036926, "kl": 0.03722766786813736, "learning_rate": 2.0022222222222225e-06, "loss": 0.0021, "num_tokens": 1908139.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 6397 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.25, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.25, "completions/mean_terminated_length": 13.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 118.48148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.022495785728096962, "kl": 0.0039089208003133535, "learning_rate": 2.001666666666667e-06, "loss": 0.0002, "num_tokens": 1908416.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6398 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 118.5, "frac_reward_zero_std": 1.0, "grad_norm": 0.039546508342027664, "kl": 0.24273832887411118, "learning_rate": 2.001111111111111e-06, "loss": 0.0121, "num_tokens": 1908716.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6399 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.25, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 13.25, "completions/mean_terminated_length": 13.25, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 118.51851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.04990554228425026, "kl": 0.011784927919507027, "learning_rate": 2.0005555555555556e-06, "loss": 0.0006, "num_tokens": 1908985.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6400 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.5, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 2.5, "completions/mean_terminated_length": 2.5, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 118.53703703703704, "frac_reward_zero_std": 0.0, "grad_norm": 24.81891632080078, "kl": 0.05277954787015915, "learning_rate": 2.0000000000000003e-06, "loss": 0.161, "num_tokens": 1909191.0, "reward": 2.75, "reward_std": 1.5, "rewards/reward_combined/mean": 2.75, "rewards/reward_combined/std": 1.5, "step": 6401 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.0, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 22.0, "completions/mean_terminated_length": 22.0, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 118.55555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.019714904949069023, "kl": 0.010155208874493837, "learning_rate": 1.9994444444444447e-06, "loss": 0.0005, "num_tokens": 1909507.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6402 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.5, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 23.5, "completions/mean_terminated_length": 23.5, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 118.57407407407408, "frac_reward_zero_std": 0.0, "grad_norm": 1.7609360218048096, "kl": 0.08150198310613632, "learning_rate": 1.998888888888889e-06, "loss": 0.006, "num_tokens": 1909821.0, "reward": 2.75, "reward_std": 2.5, "rewards/reward_combined/mean": 2.75, "rewards/reward_combined/std": 2.5, "step": 6403 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.0, "completions/clipped_ratio": 0.0, "completions/max_length": 49.0, "completions/max_terminated_length": 49.0, "completions/mean_length": 25.0, "completions/mean_terminated_length": 25.0, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 118.5925925925926, "frac_reward_zero_std": 0.0, "grad_norm": 4.15480899810791, "kl": 0.023040286265313625, "learning_rate": 1.9983333333333334e-06, "loss": 0.2364, "num_tokens": 1910161.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 6404 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 118.61111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.0069944425486028194, "kl": 0.28136877715587616, "learning_rate": 1.9977777777777778e-06, "loss": 0.0141, "num_tokens": 1910449.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6405 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.75, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 4.75, "completions/mean_terminated_length": 4.75, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 118.62962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.1392611265182495, "kl": 0.013951772067230195, "learning_rate": 1.9972222222222225e-06, "loss": 0.0006, "num_tokens": 1910668.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 6406 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 118.64814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.02746611461043358, "kl": 0.018195753917098045, "learning_rate": 1.996666666666667e-06, "loss": 0.0009, "num_tokens": 1910944.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6407 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 118.66666666666667, "frac_reward_zero_std": 1.0, "grad_norm": 0.016037823632359505, "kl": 0.005855498369783163, "learning_rate": 1.9961111111111112e-06, "loss": 0.0003, "num_tokens": 1911228.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6408 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.25, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.25, "completions/mean_terminated_length": 15.25, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 118.68518518518519, "frac_reward_zero_std": 1.0, "grad_norm": 0.08595984429121017, "kl": 0.017973463982343674, "learning_rate": 1.995555555555556e-06, "loss": 0.0009, "num_tokens": 1911521.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6409 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 118.70370370370371, "frac_reward_zero_std": 1.0, "grad_norm": 0.06628895550966263, "kl": 0.00913315312936902, "learning_rate": 1.9950000000000004e-06, "loss": 0.0004, "num_tokens": 1911823.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6410 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 45.0, "completions/clipped_ratio": 0.0, "completions/max_length": 51.0, "completions/max_terminated_length": 51.0, "completions/mean_length": 45.0, "completions/mean_terminated_length": 45.0, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 118.72222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.014316902495920658, "kl": 0.046031415462493896, "learning_rate": 1.9944444444444447e-06, "loss": 0.0023, "num_tokens": 1912283.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 6411 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 44.25, "completions/clipped_ratio": 0.0, "completions/max_length": 45.0, "completions/max_terminated_length": 45.0, "completions/mean_length": 44.25, "completions/mean_terminated_length": 44.25, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "epoch": 118.74074074074075, "frac_reward_zero_std": 1.0, "grad_norm": 0.014442003332078457, "kl": 0.06937727332115173, "learning_rate": 1.993888888888889e-06, "loss": 0.0035, "num_tokens": 1912728.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6412 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 10.5, "completions/mean_terminated_length": 10.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 118.75925925925925, "frac_reward_zero_std": 1.0, "grad_norm": 0.09784216433763504, "kl": 0.01201861584559083, "learning_rate": 1.9933333333333334e-06, "loss": 0.0006, "num_tokens": 1912994.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6413 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 84.0, "completions/clipped_ratio": 0.25, "completions/max_length": 256.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 84.0, "completions/mean_terminated_length": 26.666667938232422, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 118.77777777777777, "frac_reward_zero_std": 0.0, "grad_norm": 1.6840860843658447, "kl": 0.13889512047171593, "learning_rate": 1.992777777777778e-06, "loss": 0.4143, "num_tokens": 1913554.0, "reward": 6.050000190734863, "reward_std": 3.9000003337860107, "rewards/reward_combined/mean": 6.050000190734863, "rewards/reward_combined/std": 3.9000000953674316, "step": 6414 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 118.79629629629629, "frac_reward_zero_std": 1.0, "grad_norm": 0.0036769004072993994, "kl": 0.010485455393791199, "learning_rate": 1.992222222222222e-06, "loss": 0.0005, "num_tokens": 1913790.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 6415 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.75, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 4.75, "completions/mean_terminated_length": 4.75, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 118.81481481481481, "frac_reward_zero_std": 1.0, "grad_norm": 0.11246366798877716, "kl": 0.03549245931208134, "learning_rate": 1.991666666666667e-06, "loss": 0.0018, "num_tokens": 1914009.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 6416 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.75, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.75, "completions/mean_terminated_length": 14.75, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 118.83333333333333, "frac_reward_zero_std": 1.0, "grad_norm": 0.013193209655582905, "kl": 0.0019473450956866145, "learning_rate": 1.9911111111111113e-06, "loss": 0.0001, "num_tokens": 1914328.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6417 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 118.85185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.00010143141844309866, "kl": 9.469687938690186e-06, "learning_rate": 1.9905555555555556e-06, "loss": 0.0, "num_tokens": 1914548.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 6418 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 118.87037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.0075456672348082066, "kl": 0.0019497275352478027, "learning_rate": 1.9900000000000004e-06, "loss": 0.0001, "num_tokens": 1914760.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 6419 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 118.88888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.011065198108553886, "kl": 0.0005649402737617493, "learning_rate": 1.9894444444444448e-06, "loss": 0.0, "num_tokens": 1915004.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 6420 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 118.9074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.02632279507815838, "kl": 0.0008327096293214709, "learning_rate": 1.988888888888889e-06, "loss": 0.0, "num_tokens": 1915260.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 6421 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.5, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 17.5, "completions/mean_terminated_length": 17.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 118.92592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.05379870533943176, "kl": 0.17307216674089432, "learning_rate": 1.9883333333333335e-06, "loss": 0.0087, "num_tokens": 1915570.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6422 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 18.0, "completions/mean_terminated_length": 18.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 118.94444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.031470537185668945, "kl": 0.00619518430903554, "learning_rate": 1.987777777777778e-06, "loss": 0.0003, "num_tokens": 1915870.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6423 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.75, "completions/clipped_ratio": 0.0, "completions/max_length": 25.0, "completions/max_terminated_length": 25.0, "completions/mean_length": 19.75, "completions/mean_terminated_length": 19.75, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 118.96296296296296, "frac_reward_zero_std": 0.0, "grad_norm": 3.5630033016204834, "kl": 0.04205698613077402, "learning_rate": 1.987222222222222e-06, "loss": -0.1052, "num_tokens": 1916197.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 6424 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 48.0, "completions/clipped_ratio": 0.0, "completions/max_length": 53.0, "completions/max_terminated_length": 53.0, "completions/mean_length": 48.0, "completions/mean_terminated_length": 48.0, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 118.98148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.11929194629192352, "kl": 0.0366975162178278, "learning_rate": 1.9866666666666666e-06, "loss": 0.0018, "num_tokens": 1916609.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6425 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 119.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.006128491833806038, "kl": 0.0034243129193782806, "learning_rate": 1.9861111111111113e-06, "loss": 0.0002, "num_tokens": 1916869.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 6426 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 30.0, "completions/clipped_ratio": 0.0, "completions/max_length": 38.0, "completions/max_terminated_length": 38.0, "completions/mean_length": 30.0, "completions/mean_terminated_length": 30.0, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 119.01851851851852, "frac_reward_zero_std": 0.0, "grad_norm": 1.617873191833496, "kl": 0.056398432701826096, "learning_rate": 1.9855555555555557e-06, "loss": -0.1874, "num_tokens": 1917241.0, "reward": 7.75, "reward_std": 0.28867512941360474, "rewards/reward_combined/mean": 7.75, "rewards/reward_combined/std": 0.28867512941360474, "step": 6427 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.25, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 16.25, "completions/mean_terminated_length": 16.25, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 119.03703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.031006839126348495, "kl": 0.007423124276101589, "learning_rate": 1.985e-06, "loss": 0.0004, "num_tokens": 1917534.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6428 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 119.05555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.0222166795283556, "kl": 0.0023114101495593786, "learning_rate": 1.984444444444445e-06, "loss": 0.0001, "num_tokens": 1917854.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6429 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.5, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 16.5, "completions/mean_terminated_length": 16.5, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 119.07407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.0680152028799057, "kl": 0.015723556745797396, "learning_rate": 1.983888888888889e-06, "loss": 0.0007, "num_tokens": 1918160.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6430 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 119.0925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.08841664344072342, "kl": 0.013131663203239441, "learning_rate": 1.9833333333333335e-06, "loss": 0.0007, "num_tokens": 1918364.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 6431 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 119.11111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.01534538995474577, "kl": 0.01497855456545949, "learning_rate": 1.982777777777778e-06, "loss": 0.0007, "num_tokens": 1918624.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6432 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.5, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.5, "completions/mean_terminated_length": 6.5, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 119.12962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.05719609186053276, "kl": 0.002326124464161694, "learning_rate": 1.9822222222222223e-06, "loss": 0.0001, "num_tokens": 1918858.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 6433 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.5, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 9.5, "completions/mean_terminated_length": 9.5, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 119.14814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.023050324991345406, "kl": 0.0035633776569738984, "learning_rate": 1.9816666666666666e-06, "loss": 0.0002, "num_tokens": 1919120.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6434 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 119.16666666666667, "frac_reward_zero_std": 1.0, "grad_norm": 0.07908120006322861, "kl": 0.014268849045038223, "learning_rate": 1.981111111111111e-06, "loss": 0.0007, "num_tokens": 1919392.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6435 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 37.75, "completions/clipped_ratio": 0.0, "completions/max_length": 45.0, "completions/max_terminated_length": 45.0, "completions/mean_length": 37.75, "completions/mean_terminated_length": 37.75, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 119.18518518518519, "frac_reward_zero_std": 1.0, "grad_norm": 0.129968523979187, "kl": 0.09437369555234909, "learning_rate": 1.9805555555555557e-06, "loss": 0.0045, "num_tokens": 1919811.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6436 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.75, "completions/clipped_ratio": 0.0, "completions/max_length": 31.0, "completions/max_terminated_length": 31.0, "completions/mean_length": 21.75, "completions/mean_terminated_length": 21.75, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 119.20370370370371, "frac_reward_zero_std": 0.0, "grad_norm": 3.7316091060638428, "kl": 0.5576530005782843, "learning_rate": 1.98e-06, "loss": 0.028, "num_tokens": 1920126.0, "reward": 5.0, "reward_std": 3.674234628677368, "rewards/reward_combined/mean": 5.0, "rewards/reward_combined/std": 3.674234628677368, "step": 6437 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 37.0, "completions/clipped_ratio": 0.0, "completions/max_length": 43.0, "completions/max_terminated_length": 43.0, "completions/mean_length": 37.0, "completions/mean_terminated_length": 37.0, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 119.22222222222223, "frac_reward_zero_std": 0.0, "grad_norm": 2.1251935958862305, "kl": 0.07591760344803333, "learning_rate": 1.9794444444444445e-06, "loss": 0.1255, "num_tokens": 1920498.0, "reward": 4.0, "reward_std": 4.041451930999756, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 4.041451930999756, "step": 6438 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.25, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 8.25, "completions/mean_terminated_length": 8.25, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 119.24074074074075, "frac_reward_zero_std": 1.0, "grad_norm": 0.10176708549261093, "kl": 0.038629982620477676, "learning_rate": 1.9788888888888892e-06, "loss": 0.002, "num_tokens": 1920743.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 6439 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 119.25925925925925, "frac_reward_zero_std": 1.0, "grad_norm": 0.033448223024606705, "kl": 0.254938043653965, "learning_rate": 1.9783333333333336e-06, "loss": 0.0127, "num_tokens": 1921041.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6440 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 119.27777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.2409822642803192, "kl": 0.03991745691746473, "learning_rate": 1.977777777777778e-06, "loss": 0.0024, "num_tokens": 1921309.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6441 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 119.29629629629629, "frac_reward_zero_std": 1.0, "grad_norm": 0.00011350107524776831, "kl": 1.0266900062561035e-05, "learning_rate": 1.9772222222222223e-06, "loss": 0.0, "num_tokens": 1921529.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 6442 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 119.31481481481481, "frac_reward_zero_std": 1.0, "grad_norm": 0.007630377542227507, "kl": 0.001873701810836792, "learning_rate": 1.9766666666666667e-06, "loss": 0.0001, "num_tokens": 1921741.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 6443 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 31.5, "completions/clipped_ratio": 0.0, "completions/max_length": 38.0, "completions/max_terminated_length": 38.0, "completions/mean_length": 31.5, "completions/mean_terminated_length": 31.5, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 119.33333333333333, "frac_reward_zero_std": 0.0, "grad_norm": 1.9522647857666016, "kl": 0.1635643132030964, "learning_rate": 1.976111111111111e-06, "loss": 0.0668, "num_tokens": 1922103.0, "reward": 6.5, "reward_std": 2.0, "rewards/reward_combined/mean": 6.5, "rewards/reward_combined/std": 2.0, "step": 6444 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.25, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.25, "completions/mean_terminated_length": 13.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 119.35185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.021627090871334076, "kl": 0.003841559519059956, "learning_rate": 1.975555555555556e-06, "loss": 0.0002, "num_tokens": 1922380.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6445 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.0, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 20.0, "completions/mean_terminated_length": 20.0, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 119.37037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.2264357954263687, "kl": 0.03724466264247894, "learning_rate": 1.975e-06, "loss": 0.0019, "num_tokens": 1922680.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6446 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 35.75, "completions/clipped_ratio": 0.0, "completions/max_length": 38.0, "completions/max_terminated_length": 38.0, "completions/mean_length": 35.75, "completions/mean_terminated_length": 35.75, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 119.38888888888889, "frac_reward_zero_std": 0.0, "grad_norm": 2.4393234252929688, "kl": 0.06331833824515343, "learning_rate": 1.9744444444444445e-06, "loss": 0.0257, "num_tokens": 1923051.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 6447 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 119.4074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.03791315481066704, "kl": 0.007764843758195639, "learning_rate": 1.9738888888888893e-06, "loss": 0.0004, "num_tokens": 1923335.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6448 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.25, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 4.25, "completions/mean_terminated_length": 4.25, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 119.42592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.05421748012304306, "kl": 0.0018051937222480774, "learning_rate": 1.9733333333333336e-06, "loss": 0.0001, "num_tokens": 1923548.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 6449 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 30.0, "completions/clipped_ratio": 0.0, "completions/max_length": 35.0, "completions/max_terminated_length": 35.0, "completions/mean_length": 30.0, "completions/mean_terminated_length": 30.0, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 119.44444444444444, "frac_reward_zero_std": 0.0, "grad_norm": 3.3316471576690674, "kl": 0.13614368438720703, "learning_rate": 1.972777777777778e-06, "loss": 0.0335, "num_tokens": 1923892.0, "reward": 5.75, "reward_std": 4.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 4.5, "step": 6450 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.25, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.25, "completions/mean_terminated_length": 14.25, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 119.46296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.04870281741023064, "kl": 0.015892754308879375, "learning_rate": 1.9722222222222224e-06, "loss": 0.0008, "num_tokens": 1924181.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6451 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 119.48148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.07301069796085358, "kl": 0.013116242364048958, "learning_rate": 1.9716666666666667e-06, "loss": 0.0007, "num_tokens": 1924455.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6452 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.0, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 21.0, "completions/mean_terminated_length": 21.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 119.5, "frac_reward_zero_std": 0.0, "grad_norm": 3.5623862743377686, "kl": 0.20443397108465433, "learning_rate": 1.971111111111111e-06, "loss": -0.0447, "num_tokens": 1924767.0, "reward": 6.75, "reward_std": 2.5, "rewards/reward_combined/mean": 6.75, "rewards/reward_combined/std": 2.5, "step": 6453 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 119.51851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.08695565164089203, "kl": 0.017960233613848686, "learning_rate": 1.970555555555556e-06, "loss": 0.0009, "num_tokens": 1925035.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6454 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 119.53703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.044240858405828476, "kl": 0.010412337840534747, "learning_rate": 1.97e-06, "loss": 0.0005, "num_tokens": 1925339.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6455 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 36.25, "completions/clipped_ratio": 0.0, "completions/max_length": 51.0, "completions/max_terminated_length": 51.0, "completions/mean_length": 36.25, "completions/mean_terminated_length": 36.25, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 119.55555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.0408710241317749, "kl": 0.041423505172133446, "learning_rate": 1.9694444444444446e-06, "loss": 0.002, "num_tokens": 1925704.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6456 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.75, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 4.75, "completions/mean_terminated_length": 4.75, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 119.57407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.05667131021618843, "kl": 0.028361966833472252, "learning_rate": 1.968888888888889e-06, "loss": 0.0014, "num_tokens": 1925923.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 6457 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.5, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 18.5, "completions/mean_terminated_length": 18.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 119.5925925925926, "frac_reward_zero_std": 0.0, "grad_norm": 2.2878308296203613, "kl": 0.3449918180704117, "learning_rate": 1.9683333333333337e-06, "loss": 0.0272, "num_tokens": 1926245.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 6458 }, { "clip_ratio/high_max": 0.0078125, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "completion_length": 29.0, "completions/clipped_ratio": 0.0, "completions/max_length": 33.0, "completions/max_terminated_length": 33.0, "completions/mean_length": 29.0, "completions/mean_terminated_length": 29.0, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 119.61111111111111, "frac_reward_zero_std": 0.0, "grad_norm": 2.2490384578704834, "kl": 0.17745987325906754, "learning_rate": 1.967777777777778e-06, "loss": -0.0544, "num_tokens": 1926577.0, "reward": 5.75, "reward_std": 2.598076105117798, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 2.598076105117798, "step": 6459 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 119.62962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.0038346066139638424, "kl": 0.010449804365634918, "learning_rate": 1.9672222222222224e-06, "loss": 0.0005, "num_tokens": 1926813.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 6460 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.75, "completions/clipped_ratio": 0.0, "completions/max_length": 29.0, "completions/max_terminated_length": 29.0, "completions/mean_length": 24.75, "completions/mean_terminated_length": 24.75, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 119.64814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.10492925345897675, "kl": 0.09332165867090225, "learning_rate": 1.9666666666666668e-06, "loss": 0.0047, "num_tokens": 1927132.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 6461 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 119.66666666666667, "frac_reward_zero_std": 1.0, "grad_norm": 0.01951487921178341, "kl": 0.009429986588656902, "learning_rate": 1.966111111111111e-06, "loss": 0.0005, "num_tokens": 1927420.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6462 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.25, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 21.25, "completions/mean_terminated_length": 21.25, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 119.68518518518519, "frac_reward_zero_std": 1.0, "grad_norm": 0.06126157566905022, "kl": 0.017302922904491425, "learning_rate": 1.965555555555556e-06, "loss": 0.0009, "num_tokens": 1927749.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6463 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 10.5, "completions/mean_terminated_length": 10.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 119.70370370370371, "frac_reward_zero_std": 1.0, "grad_norm": 0.04719242826104164, "kl": 0.004590743687003851, "learning_rate": 1.9650000000000002e-06, "loss": 0.0002, "num_tokens": 1928011.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6464 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 47.0, "completions/clipped_ratio": 0.0, "completions/max_length": 51.0, "completions/max_terminated_length": 51.0, "completions/mean_length": 47.0, "completions/mean_terminated_length": 47.0, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 119.72222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.006553266197443008, "kl": 0.04418064281344414, "learning_rate": 1.9644444444444446e-06, "loss": 0.0022, "num_tokens": 1928479.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 6465 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 119.74074074074075, "frac_reward_zero_std": 1.0, "grad_norm": 0.08638431876897812, "kl": 0.01235174760222435, "learning_rate": 1.963888888888889e-06, "loss": 0.0006, "num_tokens": 1928739.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 6466 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 5.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 5.0, "completions/mean_terminated_length": 5.0, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 119.75925925925925, "frac_reward_zero_std": 1.0, "grad_norm": 0.0039452617056667805, "kl": 0.00027898550615645945, "learning_rate": 1.9633333333333333e-06, "loss": 0.0, "num_tokens": 1928959.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 6467 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 119.77777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.0073820557445287704, "kl": 0.28124503791332245, "learning_rate": 1.962777777777778e-06, "loss": 0.0141, "num_tokens": 1929247.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6468 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 119.79629629629629, "frac_reward_zero_std": 1.0, "grad_norm": 0.13615044951438904, "kl": 0.012785591650754213, "learning_rate": 1.9622222222222224e-06, "loss": 0.0006, "num_tokens": 1929513.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6469 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 119.81481481481481, "frac_reward_zero_std": 1.0, "grad_norm": 0.017850253731012344, "kl": 0.000328943133354187, "learning_rate": 1.961666666666667e-06, "loss": 0.0, "num_tokens": 1929769.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 6470 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 17.0, "completions/mean_terminated_length": 17.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 119.83333333333333, "frac_reward_zero_std": 1.0, "grad_norm": 0.015319770202040672, "kl": 0.17293661087751389, "learning_rate": 1.9611111111111116e-06, "loss": 0.0086, "num_tokens": 1930077.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6471 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 119.85185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.4748814105987549, "kl": 0.07984197977930307, "learning_rate": 1.960555555555556e-06, "loss": 0.0042, "num_tokens": 1930371.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6472 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 119.87037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.04073766991496086, "kl": 0.00794592616148293, "learning_rate": 1.9600000000000003e-06, "loss": 0.0004, "num_tokens": 1930671.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6473 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.0, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 20.0, "completions/mean_terminated_length": 20.0, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 119.88888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.06303585320711136, "kl": 0.02688495721668005, "learning_rate": 1.9594444444444446e-06, "loss": 0.0013, "num_tokens": 1931003.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6474 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 119.9074074074074, "frac_reward_zero_std": 0.0, "grad_norm": 4.488112449645996, "kl": 0.017169796861708164, "learning_rate": 1.958888888888889e-06, "loss": 0.1338, "num_tokens": 1931291.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 6475 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 26.75, "completions/clipped_ratio": 0.0, "completions/max_length": 33.0, "completions/max_terminated_length": 33.0, "completions/mean_length": 26.75, "completions/mean_terminated_length": 26.75, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 119.92592592592592, "frac_reward_zero_std": 0.0, "grad_norm": 3.5804576873779297, "kl": 0.2464837282896042, "learning_rate": 1.9583333333333334e-06, "loss": 0.0373, "num_tokens": 1931646.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 6476 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 119.94444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.018063750118017197, "kl": 0.012790998443961143, "learning_rate": 1.9577777777777777e-06, "loss": 0.0006, "num_tokens": 1931958.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 6477 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 10.5, "completions/mean_terminated_length": 10.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 119.96296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.12249667197465897, "kl": 0.015453476458787918, "learning_rate": 1.9572222222222225e-06, "loss": 0.0008, "num_tokens": 1932224.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6478 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.25, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 10.25, "completions/mean_terminated_length": 10.25, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 119.98148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.0285570677369833, "kl": 0.0021534846746362746, "learning_rate": 1.956666666666667e-06, "loss": 0.0001, "num_tokens": 1932485.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 6479 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.5, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 16.5, "completions/mean_terminated_length": 16.5, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 120.0, "frac_reward_zero_std": 0.0, "grad_norm": 2.2294397354125977, "kl": 0.20017245411872864, "learning_rate": 1.956111111111111e-06, "loss": 0.0365, "num_tokens": 1932771.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 6480 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 120.01851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.02521766535937786, "kl": 0.0005848780274391174, "learning_rate": 1.955555555555556e-06, "loss": 0.0, "num_tokens": 1932983.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 6481 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 31.5, "completions/clipped_ratio": 0.0, "completions/max_length": 39.0, "completions/max_terminated_length": 39.0, "completions/mean_length": 31.5, "completions/mean_terminated_length": 31.5, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 120.03703703703704, "frac_reward_zero_std": 0.0, "grad_norm": 4.339416980743408, "kl": 0.20131179690361023, "learning_rate": 1.9550000000000003e-06, "loss": 0.0911, "num_tokens": 1933325.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 6482 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 26.0, "completions/clipped_ratio": 0.0, "completions/max_length": 39.0, "completions/max_terminated_length": 39.0, "completions/mean_length": 26.0, "completions/mean_terminated_length": 26.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 120.05555555555556, "frac_reward_zero_std": 0.0, "grad_norm": 2.9744746685028076, "kl": 0.108348548412323, "learning_rate": 1.9544444444444447e-06, "loss": -0.1018, "num_tokens": 1933657.0, "reward": 5.625, "reward_std": 2.75, "rewards/reward_combined/mean": 5.625, "rewards/reward_combined/std": 2.75, "step": 6483 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.75, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 18.75, "completions/mean_terminated_length": 18.75, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 120.07407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.06361526250839233, "kl": 0.01797039620578289, "learning_rate": 1.953888888888889e-06, "loss": 0.0009, "num_tokens": 1933980.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6484 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 120.0925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.16207140684127808, "kl": 0.037412811536341906, "learning_rate": 1.9533333333333334e-06, "loss": 0.002, "num_tokens": 1934278.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6485 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.5, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 2.5, "completions/mean_terminated_length": 2.5, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 120.11111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.07378528267145157, "kl": 0.006324201822280884, "learning_rate": 1.9527777777777778e-06, "loss": 0.0003, "num_tokens": 1934484.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 6486 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.25, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.25, "completions/mean_terminated_length": 13.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 120.12962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.02191968634724617, "kl": 0.002932181698270142, "learning_rate": 1.952222222222222e-06, "loss": 0.0001, "num_tokens": 1934761.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6487 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 30.75, "completions/clipped_ratio": 0.0, "completions/max_length": 45.0, "completions/max_terminated_length": 45.0, "completions/mean_length": 30.75, "completions/mean_terminated_length": 30.75, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 120.14814814814815, "frac_reward_zero_std": 0.0, "grad_norm": 2.8043277263641357, "kl": 0.12593835219740868, "learning_rate": 1.951666666666667e-06, "loss": -0.3837, "num_tokens": 1935136.0, "reward": 7.75, "reward_std": 0.28867512941360474, "rewards/reward_combined/mean": 7.75, "rewards/reward_combined/std": 0.28867512941360474, "step": 6488 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.0, "completions/clipped_ratio": 0.0, "completions/max_length": 27.0, "completions/max_terminated_length": 27.0, "completions/mean_length": 20.0, "completions/mean_terminated_length": 20.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 120.16666666666667, "frac_reward_zero_std": 1.0, "grad_norm": 0.0935901403427124, "kl": 0.025010783225297928, "learning_rate": 1.9511111111111113e-06, "loss": 0.0012, "num_tokens": 1935436.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6489 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.25, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.25, "completions/mean_terminated_length": 14.25, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 120.18518518518519, "frac_reward_zero_std": 1.0, "grad_norm": 0.06641992926597595, "kl": 0.019183735363185406, "learning_rate": 1.9505555555555556e-06, "loss": 0.001, "num_tokens": 1935725.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6490 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.25, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 15.25, "completions/mean_terminated_length": 15.25, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 120.20370370370371, "frac_reward_zero_std": 1.0, "grad_norm": 0.061356306076049805, "kl": 0.24153731763362885, "learning_rate": 1.9500000000000004e-06, "loss": 0.0123, "num_tokens": 1936010.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6491 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.5, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 18.5, "completions/mean_terminated_length": 18.5, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 120.22222222222223, "frac_reward_zero_std": 0.0, "grad_norm": 3.7345752716064453, "kl": 0.09552976489067078, "learning_rate": 1.9494444444444447e-06, "loss": 0.0125, "num_tokens": 1936300.0, "reward": 5.125, "reward_std": 3.4731109142303467, "rewards/reward_combined/mean": 5.125, "rewards/reward_combined/std": 3.4731109142303467, "step": 6492 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 120.24074074074075, "frac_reward_zero_std": 1.0, "grad_norm": 0.00767434062436223, "kl": 0.0018703937530517578, "learning_rate": 1.948888888888889e-06, "loss": 0.0001, "num_tokens": 1936512.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 6493 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 36.5, "completions/clipped_ratio": 0.0, "completions/max_length": 43.0, "completions/max_terminated_length": 43.0, "completions/mean_length": 36.5, "completions/mean_terminated_length": 36.5, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 120.25925925925925, "frac_reward_zero_std": 1.0, "grad_norm": 0.2616035044193268, "kl": 0.11149190366268158, "learning_rate": 1.9483333333333335e-06, "loss": 0.0057, "num_tokens": 1936886.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6494 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 120.27777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.01431599073112011, "kl": 0.0019892871496267617, "learning_rate": 1.947777777777778e-06, "loss": 0.0001, "num_tokens": 1937146.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 6495 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 120.29629629629629, "frac_reward_zero_std": 1.0, "grad_norm": 0.03802303224802017, "kl": 0.009507924783974886, "learning_rate": 1.947222222222222e-06, "loss": 0.0005, "num_tokens": 1937438.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6496 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.75, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 21.75, "completions/mean_terminated_length": 21.75, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 120.31481481481481, "frac_reward_zero_std": 1.0, "grad_norm": 0.1803891807794571, "kl": 0.10679489001631737, "learning_rate": 1.9466666666666665e-06, "loss": 0.0053, "num_tokens": 1937745.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 6497 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.5, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 20.5, "completions/mean_terminated_length": 20.5, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 120.33333333333333, "frac_reward_zero_std": 1.0, "grad_norm": 0.03688585013151169, "kl": 0.01846693642437458, "learning_rate": 1.9461111111111113e-06, "loss": 0.0009, "num_tokens": 1938079.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6498 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 36.75, "completions/clipped_ratio": 0.0, "completions/max_length": 45.0, "completions/max_terminated_length": 45.0, "completions/mean_length": 36.75, "completions/mean_terminated_length": 36.75, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 120.35185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.04802023991942406, "kl": 0.08780543133616447, "learning_rate": 1.9455555555555557e-06, "loss": 0.0043, "num_tokens": 1938494.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6499 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 27.0, "completions/clipped_ratio": 0.0, "completions/max_length": 29.0, "completions/max_terminated_length": 29.0, "completions/mean_length": 27.0, "completions/mean_terminated_length": 27.0, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 120.37037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.542801022529602, "kl": 0.06694403663277626, "learning_rate": 1.945e-06, "loss": 0.0032, "num_tokens": 1938850.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 6500 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 40.75, "completions/clipped_ratio": 0.0, "completions/max_length": 43.0, "completions/max_terminated_length": 43.0, "completions/mean_length": 40.75, "completions/mean_terminated_length": 40.75, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 120.38888888888889, "frac_reward_zero_std": 0.0, "grad_norm": 1.0851104259490967, "kl": 0.39086587727069855, "learning_rate": 1.944444444444445e-06, "loss": -0.0414, "num_tokens": 1939293.0, "reward": 2.375, "reward_std": 1.25, "rewards/reward_combined/mean": 2.375, "rewards/reward_combined/std": 1.25, "step": 6501 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 120.4074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.019404016435146332, "kl": 0.012362733483314514, "learning_rate": 1.943888888888889e-06, "loss": 0.0006, "num_tokens": 1939605.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 6502 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 36.5, "completions/clipped_ratio": 0.0, "completions/max_length": 52.0, "completions/max_terminated_length": 52.0, "completions/mean_length": 36.5, "completions/mean_terminated_length": 36.5, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 120.42592592592592, "frac_reward_zero_std": 0.0, "grad_norm": 1.9222360849380493, "kl": 0.14974160864949226, "learning_rate": 1.9433333333333335e-06, "loss": 0.0677, "num_tokens": 1939987.0, "reward": 3.625, "reward_std": 2.8975563049316406, "rewards/reward_combined/mean": 3.625, "rewards/reward_combined/std": 2.8975565433502197, "step": 6503 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.75, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.75, "completions/mean_terminated_length": 6.75, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 120.44444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.0246490016579628, "kl": 0.0014408514252863824, "learning_rate": 1.942777777777778e-06, "loss": 0.0001, "num_tokens": 1940222.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 6504 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 120.46296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.049587640911340714, "kl": 0.005654967622831464, "learning_rate": 1.9422222222222222e-06, "loss": 0.0003, "num_tokens": 1940518.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6505 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 120.48148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.03195283189415932, "kl": 0.24108757823705673, "learning_rate": 1.9416666666666666e-06, "loss": 0.012, "num_tokens": 1940818.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6506 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 36.75, "completions/clipped_ratio": 0.0, "completions/max_length": 46.0, "completions/max_terminated_length": 46.0, "completions/mean_length": 36.75, "completions/mean_terminated_length": 36.75, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 120.5, "frac_reward_zero_std": 1.0, "grad_norm": 0.07869917899370193, "kl": 0.06443023309111595, "learning_rate": 1.9411111111111113e-06, "loss": 0.0031, "num_tokens": 1941189.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6507 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 120.51851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.00012037131091346964, "kl": 1.0520219802856445e-05, "learning_rate": 1.9405555555555557e-06, "loss": 0.0, "num_tokens": 1941409.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 6508 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.25, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 6.25, "completions/mean_terminated_length": 6.25, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 120.53703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.13927407562732697, "kl": 0.010929748881608248, "learning_rate": 1.94e-06, "loss": 0.0007, "num_tokens": 1941634.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 6509 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.75, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 13.75, "completions/mean_terminated_length": 13.75, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 120.55555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.05705585703253746, "kl": 0.012273144442588091, "learning_rate": 1.939444444444445e-06, "loss": 0.0006, "num_tokens": 1941921.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6510 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 120.57407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.004153565503656864, "kl": 0.010400138795375824, "learning_rate": 1.938888888888889e-06, "loss": 0.0005, "num_tokens": 1942157.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 6511 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 120.5925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.020041082054376602, "kl": 0.0034822411835193634, "learning_rate": 1.9383333333333336e-06, "loss": 0.0002, "num_tokens": 1942417.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 6512 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 5.75, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 5.75, "completions/mean_terminated_length": 5.75, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 120.61111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.05672154203057289, "kl": 0.026305489242076874, "learning_rate": 1.937777777777778e-06, "loss": 0.0013, "num_tokens": 1942640.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 6513 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.5, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 18.5, "completions/mean_terminated_length": 18.5, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 120.62962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.048957809805870056, "kl": 0.013490078505128622, "learning_rate": 1.9372222222222223e-06, "loss": 0.0007, "num_tokens": 1942942.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6514 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.25, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 8.25, "completions/mean_terminated_length": 8.25, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 120.64814814814815, "frac_reward_zero_std": 0.0, "grad_norm": 4.85786247253418, "kl": 0.05061430484056473, "learning_rate": 1.9366666666666666e-06, "loss": 0.0276, "num_tokens": 1943187.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 6515 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.5, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 9.5, "completions/mean_terminated_length": 9.5, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 120.66666666666667, "frac_reward_zero_std": 1.0, "grad_norm": 0.03714510053396225, "kl": 0.003538968274369836, "learning_rate": 1.9361111111111114e-06, "loss": 0.0002, "num_tokens": 1943445.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6516 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 120.68518518518519, "frac_reward_zero_std": 1.0, "grad_norm": 0.05522409453988075, "kl": 0.05054583214223385, "learning_rate": 1.9355555555555558e-06, "loss": 0.0028, "num_tokens": 1943721.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6517 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 120.70370370370371, "frac_reward_zero_std": 1.0, "grad_norm": 0.019498778507113457, "kl": 0.00033126772177638486, "learning_rate": 1.935e-06, "loss": 0.0, "num_tokens": 1943977.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 6518 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.75, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 19.75, "completions/mean_terminated_length": 19.75, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 120.72222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.9038255214691162, "kl": 0.13639210909605026, "learning_rate": 1.9344444444444445e-06, "loss": 0.0072, "num_tokens": 1944300.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6519 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 120.74074074074075, "frac_reward_zero_std": 1.0, "grad_norm": 0.016522077843546867, "kl": 0.014803973026573658, "learning_rate": 1.9338888888888892e-06, "loss": 0.0007, "num_tokens": 1944560.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6520 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 120.75925925925925, "frac_reward_zero_std": 1.0, "grad_norm": 0.09604573994874954, "kl": 0.05625455221161246, "learning_rate": 1.9333333333333336e-06, "loss": 0.0028, "num_tokens": 1944832.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6521 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 120.77777777777777, "frac_reward_zero_std": 0.0, "grad_norm": 3.1670405864715576, "kl": 0.2035636603832245, "learning_rate": 1.932777777777778e-06, "loss": 0.0115, "num_tokens": 1945116.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 6522 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 120.79629629629629, "frac_reward_zero_std": 0.0, "grad_norm": 4.179937839508057, "kl": 0.016440263483673334, "learning_rate": 1.9322222222222223e-06, "loss": 0.0333, "num_tokens": 1945384.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 6523 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 37.5, "completions/clipped_ratio": 0.0, "completions/max_length": 39.0, "completions/max_terminated_length": 39.0, "completions/mean_length": 37.5, "completions/mean_terminated_length": 37.5, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 120.81481481481481, "frac_reward_zero_std": 1.0, "grad_norm": 0.04339351877570152, "kl": 0.051138926297426224, "learning_rate": 1.9316666666666667e-06, "loss": 0.0026, "num_tokens": 1945754.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6524 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 120.83333333333333, "frac_reward_zero_std": 1.0, "grad_norm": 0.11538664996623993, "kl": 0.009717530105262995, "learning_rate": 1.9311111111111114e-06, "loss": 0.0005, "num_tokens": 1946024.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6525 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 18.0, "completions/mean_terminated_length": 18.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 120.85185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.007945375517010689, "kl": 0.0025849855737760663, "learning_rate": 1.930555555555556e-06, "loss": 0.0001, "num_tokens": 1946336.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6526 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.25, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 19.25, "completions/mean_terminated_length": 19.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 120.87037037037037, "frac_reward_zero_std": 0.0, "grad_norm": 4.090182781219482, "kl": 0.20719069987535477, "learning_rate": 1.93e-06, "loss": 0.1187, "num_tokens": 1946653.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 6527 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 120.88888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.3397001326084137, "kl": 0.07729673385620117, "learning_rate": 1.9294444444444445e-06, "loss": 0.0045, "num_tokens": 1946933.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6528 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 29.0, "completions/clipped_ratio": 0.0, "completions/max_length": 31.0, "completions/max_terminated_length": 31.0, "completions/mean_length": 29.0, "completions/mean_terminated_length": 29.0, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 120.9074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.09474819898605347, "kl": 0.1433454230427742, "learning_rate": 1.928888888888889e-06, "loss": 0.0072, "num_tokens": 1947273.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6529 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 120.92592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.019495802000164986, "kl": 0.004136958159506321, "learning_rate": 1.9283333333333336e-06, "loss": 0.0002, "num_tokens": 1947541.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6530 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 120.94444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.07349266856908798, "kl": 0.014702288433909416, "learning_rate": 1.927777777777778e-06, "loss": 0.0007, "num_tokens": 1947819.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6531 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.0, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 22.0, "completions/mean_terminated_length": 22.0, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 120.96296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.03636496886610985, "kl": 0.013339686207473278, "learning_rate": 1.9272222222222224e-06, "loss": 0.0007, "num_tokens": 1948135.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6532 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 120.98148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.1595379114151001, "kl": 0.02149766217917204, "learning_rate": 1.926666666666667e-06, "loss": 0.0011, "num_tokens": 1948421.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6533 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.75, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.75, "completions/mean_terminated_length": 14.75, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 121.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.012015879154205322, "kl": 0.0019246787996962667, "learning_rate": 1.9261111111111115e-06, "loss": 0.0001, "num_tokens": 1948740.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6534 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 121.01851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.060225583612918854, "kl": 0.010697120800614357, "learning_rate": 1.925555555555556e-06, "loss": 0.0005, "num_tokens": 1949012.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6535 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.5, "completions/clipped_ratio": 0.0, "completions/max_length": 25.0, "completions/max_terminated_length": 25.0, "completions/mean_length": 20.5, "completions/mean_terminated_length": 20.5, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 121.03703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.04163579270243645, "kl": 0.015399678610265255, "learning_rate": 1.925e-06, "loss": 0.0008, "num_tokens": 1949342.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 6536 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 121.05555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.009569196030497551, "kl": 0.001042235642671585, "learning_rate": 1.9244444444444446e-06, "loss": 0.0001, "num_tokens": 1949586.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 6537 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 121.07407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.012683825567364693, "kl": 0.0035776831209659576, "learning_rate": 1.923888888888889e-06, "loss": 0.0002, "num_tokens": 1949846.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 6538 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.25, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 4.25, "completions/mean_terminated_length": 4.25, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 121.0925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.029448438435792923, "kl": 0.0005386561097111553, "learning_rate": 1.9233333333333333e-06, "loss": 0.0, "num_tokens": 1950059.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 6539 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.0, "completions/clipped_ratio": 0.0, "completions/max_length": 35.0, "completions/max_terminated_length": 35.0, "completions/mean_length": 25.0, "completions/mean_terminated_length": 25.0, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 121.11111111111111, "frac_reward_zero_std": 0.0, "grad_norm": 2.4642410278320312, "kl": 0.06212102249264717, "learning_rate": 1.922777777777778e-06, "loss": 0.0942, "num_tokens": 1950387.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 6540 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 121.12962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.004147065803408623, "kl": 0.01041381061077118, "learning_rate": 1.9222222222222224e-06, "loss": 0.0005, "num_tokens": 1950623.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 6541 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 121.14814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.015492178499698639, "kl": 0.015001409687101841, "learning_rate": 1.9216666666666668e-06, "loss": 0.0008, "num_tokens": 1950883.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6542 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 31.25, "completions/clipped_ratio": 0.0, "completions/max_length": 36.0, "completions/max_terminated_length": 36.0, "completions/mean_length": 31.25, "completions/mean_terminated_length": 31.25, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 121.16666666666667, "frac_reward_zero_std": 1.0, "grad_norm": 0.23359765112400055, "kl": 0.141707643866539, "learning_rate": 1.9211111111111115e-06, "loss": 0.0071, "num_tokens": 1951244.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6543 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 121.18518518518519, "frac_reward_zero_std": 1.0, "grad_norm": 0.028401212766766548, "kl": 0.2542877569794655, "learning_rate": 1.920555555555556e-06, "loss": 0.0127, "num_tokens": 1951542.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6544 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 121.20370370370371, "frac_reward_zero_std": 1.0, "grad_norm": 0.0022804702166467905, "kl": 0.00014090240438235924, "learning_rate": 1.9200000000000003e-06, "loss": 0.0, "num_tokens": 1951798.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 6545 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.5, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 12.5, "completions/mean_terminated_length": 12.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 121.22222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.02908479981124401, "kl": 0.00393561169039458, "learning_rate": 1.9194444444444446e-06, "loss": 0.0002, "num_tokens": 1952072.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6546 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 121.24074074074075, "frac_reward_zero_std": 1.0, "grad_norm": 0.05095456540584564, "kl": 0.009257306810468435, "learning_rate": 1.918888888888889e-06, "loss": 0.0005, "num_tokens": 1952364.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6547 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.25, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 22.25, "completions/mean_terminated_length": 22.25, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 121.25925925925925, "frac_reward_zero_std": 1.0, "grad_norm": 0.3533603549003601, "kl": 0.061590869911015034, "learning_rate": 1.9183333333333333e-06, "loss": 0.0032, "num_tokens": 1952697.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6548 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 121.27777777777777, "frac_reward_zero_std": 0.0, "grad_norm": 4.205292701721191, "kl": 0.03627455234527588, "learning_rate": 1.9177777777777777e-06, "loss": 0.0353, "num_tokens": 1952971.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 6549 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 121.29629629629629, "frac_reward_zero_std": 1.0, "grad_norm": 0.1962321251630783, "kl": 0.02130901685450226, "learning_rate": 1.9172222222222225e-06, "loss": 0.001, "num_tokens": 1953275.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6550 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 10.5, "completions/mean_terminated_length": 10.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 121.31481481481481, "frac_reward_zero_std": 1.0, "grad_norm": 0.09418157488107681, "kl": 0.008412047289311886, "learning_rate": 1.916666666666667e-06, "loss": 0.0004, "num_tokens": 1953541.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6551 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 121.33333333333333, "frac_reward_zero_std": 1.0, "grad_norm": 0.019669540226459503, "kl": 0.006216781213879585, "learning_rate": 1.916111111111111e-06, "loss": 0.0003, "num_tokens": 1953853.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 6552 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 36.75, "completions/clipped_ratio": 0.0, "completions/max_length": 46.0, "completions/max_terminated_length": 46.0, "completions/mean_length": 36.75, "completions/mean_terminated_length": 36.75, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 121.35185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.26276618242263794, "kl": 0.1119781844317913, "learning_rate": 1.915555555555556e-06, "loss": 0.0057, "num_tokens": 1954228.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6553 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.75, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.75, "completions/mean_terminated_length": 14.75, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 121.37037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.06343358755111694, "kl": 0.018441941123455763, "learning_rate": 1.9150000000000003e-06, "loss": 0.0009, "num_tokens": 1954519.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6554 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 7.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 7.0, "completions/mean_terminated_length": 7.0, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 121.38888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.04465528950095177, "kl": 0.006866212352178991, "learning_rate": 1.9144444444444447e-06, "loss": 0.0003, "num_tokens": 1954755.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 6555 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.25, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 15.25, "completions/mean_terminated_length": 15.25, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 121.4074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.05876487120985985, "kl": 0.23161007463932037, "learning_rate": 1.913888888888889e-06, "loss": 0.0118, "num_tokens": 1955040.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6556 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.75, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 20.75, "completions/mean_terminated_length": 20.75, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 121.42592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.04281169921159744, "kl": 0.20476361084729433, "learning_rate": 1.9133333333333334e-06, "loss": 0.0105, "num_tokens": 1955371.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6557 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.25, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.25, "completions/mean_terminated_length": 15.25, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 121.44444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.028130121529102325, "kl": 0.0029474864713847637, "learning_rate": 1.9127777777777777e-06, "loss": 0.0001, "num_tokens": 1955692.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6558 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.75, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 14.75, "completions/mean_terminated_length": 14.75, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 121.46296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.09994383156299591, "kl": 0.021512024104595184, "learning_rate": 1.912222222222222e-06, "loss": 0.0011, "num_tokens": 1955975.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6559 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.5, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 20.5, "completions/mean_terminated_length": 20.5, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 121.48148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.15383422374725342, "kl": 0.045628881081938744, "learning_rate": 1.911666666666667e-06, "loss": 0.0023, "num_tokens": 1956309.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6560 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 121.5, "frac_reward_zero_std": 1.0, "grad_norm": 0.011898176744580269, "kl": 0.0013358891010284424, "learning_rate": 1.9111111111111112e-06, "loss": 0.0001, "num_tokens": 1956569.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 6561 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 10.5, "completions/mean_terminated_length": 10.5, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 121.51851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.01552923396229744, "kl": 0.004067820031195879, "learning_rate": 1.9105555555555556e-06, "loss": 0.0002, "num_tokens": 1956835.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6562 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 121.53703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.03916758671402931, "kl": 0.1639244109392166, "learning_rate": 1.9100000000000003e-06, "loss": 0.0082, "num_tokens": 1957119.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6563 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.0, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 22.0, "completions/mean_terminated_length": 22.0, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 121.55555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.05532076582312584, "kl": 0.01927832141518593, "learning_rate": 1.9094444444444447e-06, "loss": 0.001, "num_tokens": 1957435.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6564 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 121.57407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.04299955815076828, "kl": 0.003602264914661646, "learning_rate": 1.908888888888889e-06, "loss": 0.0002, "num_tokens": 1957695.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6565 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 121.5925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.33177894353866577, "kl": 0.05730462446808815, "learning_rate": 1.9083333333333334e-06, "loss": 0.0029, "num_tokens": 1957983.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6566 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.75, "completions/clipped_ratio": 0.0, "completions/max_length": 42.0, "completions/max_terminated_length": 42.0, "completions/mean_length": 24.75, "completions/mean_terminated_length": 24.75, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 121.61111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.2842016816139221, "kl": 0.08097782731056213, "learning_rate": 1.9077777777777778e-06, "loss": 0.0043, "num_tokens": 1958302.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6567 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 121.62962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.029925130307674408, "kl": 0.0035382889327593148, "learning_rate": 1.9072222222222223e-06, "loss": 0.0002, "num_tokens": 1958598.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6568 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.25, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 17.25, "completions/mean_terminated_length": 17.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 121.64814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.0920930728316307, "kl": 0.18234994262456894, "learning_rate": 1.906666666666667e-06, "loss": 0.0091, "num_tokens": 1958907.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6569 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 121.66666666666667, "frac_reward_zero_std": 1.0, "grad_norm": 0.007455828599631786, "kl": 0.0018129795789718628, "learning_rate": 1.9061111111111113e-06, "loss": 0.0001, "num_tokens": 1959119.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 6570 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.5, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 3.5, "completions/mean_terminated_length": 3.5, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 121.68518518518519, "frac_reward_zero_std": 1.0, "grad_norm": 0.018305161967873573, "kl": 0.0032043084502220154, "learning_rate": 1.9055555555555558e-06, "loss": 0.0001, "num_tokens": 1959329.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 6571 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.25, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 19.25, "completions/mean_terminated_length": 19.25, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 121.70370370370371, "frac_reward_zero_std": 1.0, "grad_norm": 0.2862795293331146, "kl": 0.11828611046075821, "learning_rate": 1.9050000000000002e-06, "loss": 0.006, "num_tokens": 1959626.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 6572 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 5.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 5.0, "completions/mean_terminated_length": 5.0, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 121.72222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.004563179332762957, "kl": 0.0004115998890483752, "learning_rate": 1.9044444444444445e-06, "loss": 0.0, "num_tokens": 1959846.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 6573 }, { "clip_ratio/high_max": 0.007575757801532745, "clip_ratio/high_mean": 0.007575757801532745, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.007575757801532745, "completion_length": 35.5, "completions/clipped_ratio": 0.0, "completions/max_length": 46.0, "completions/max_terminated_length": 46.0, "completions/mean_length": 35.5, "completions/mean_terminated_length": 35.5, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 121.74074074074075, "frac_reward_zero_std": 0.0, "grad_norm": 1.77127206325531, "kl": 0.13749685138463974, "learning_rate": 1.9038888888888891e-06, "loss": -0.1003, "num_tokens": 1960256.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 6574 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 36.5, "completions/clipped_ratio": 0.0, "completions/max_length": 61.0, "completions/max_terminated_length": 61.0, "completions/mean_length": 36.5, "completions/mean_terminated_length": 36.5, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 121.75925925925925, "frac_reward_zero_std": 0.0, "grad_norm": 5.323081016540527, "kl": 0.5849427469074726, "learning_rate": 1.9033333333333335e-06, "loss": 0.2261, "num_tokens": 1960618.0, "reward": 4.625, "reward_std": 4.308422088623047, "rewards/reward_combined/mean": 4.625, "rewards/reward_combined/std": 4.308422088623047, "step": 6575 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 12.5, "completions/mean_terminated_length": 12.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 121.77777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.06007037311792374, "kl": 0.009533826727420092, "learning_rate": 1.9027777777777778e-06, "loss": 0.0005, "num_tokens": 1960900.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6576 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 31.75, "completions/clipped_ratio": 0.0, "completions/max_length": 35.0, "completions/max_terminated_length": 35.0, "completions/mean_length": 31.75, "completions/mean_terminated_length": 31.75, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 121.79629629629629, "frac_reward_zero_std": 1.0, "grad_norm": 0.16993506252765656, "kl": 0.14381812512874603, "learning_rate": 1.9022222222222222e-06, "loss": 0.0072, "num_tokens": 1961251.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6577 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 43.75, "completions/clipped_ratio": 0.0, "completions/max_length": 45.0, "completions/max_terminated_length": 45.0, "completions/mean_length": 43.75, "completions/mean_terminated_length": 43.75, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 121.81481481481481, "frac_reward_zero_std": 1.0, "grad_norm": 0.030439620837569237, "kl": 0.047766825184226036, "learning_rate": 1.901666666666667e-06, "loss": 0.0024, "num_tokens": 1961706.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 6578 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 36.75, "completions/clipped_ratio": 0.0, "completions/max_length": 40.0, "completions/max_terminated_length": 40.0, "completions/mean_length": 36.75, "completions/mean_terminated_length": 36.75, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 121.83333333333333, "frac_reward_zero_std": 1.0, "grad_norm": 0.02379870042204857, "kl": 0.038969412446022034, "learning_rate": 1.9011111111111113e-06, "loss": 0.002, "num_tokens": 1962077.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6579 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.0, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 17.0, "completions/mean_terminated_length": 17.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 121.85185185185185, "frac_reward_zero_std": 0.0, "grad_norm": 3.7396790981292725, "kl": 0.01768910512328148, "learning_rate": 1.9005555555555557e-06, "loss": -0.0249, "num_tokens": 1962373.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 6580 }, { "clip_ratio/high_max": 0.004464285913854837, "clip_ratio/high_mean": 0.004464285913854837, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004464285913854837, "completion_length": 37.5, "completions/clipped_ratio": 0.0, "completions/max_length": 82.0, "completions/max_terminated_length": 82.0, "completions/mean_length": 37.5, "completions/mean_terminated_length": 37.5, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 121.87037037037037, "frac_reward_zero_std": 0.0, "grad_norm": 2.4093854427337646, "kl": 0.05246824771165848, "learning_rate": 1.9000000000000002e-06, "loss": 0.2464, "num_tokens": 1962775.0, "reward": 6.0, "reward_std": 3.34165620803833, "rewards/reward_combined/mean": 6.0, "rewards/reward_combined/std": 3.34165620803833, "step": 6581 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.25, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 20.25, "completions/mean_terminated_length": 20.25, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 121.88888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.05882710590958595, "kl": 0.020187518559396267, "learning_rate": 1.8994444444444446e-06, "loss": 0.001, "num_tokens": 1963076.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6582 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 121.9074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.05334044620394707, "kl": 0.02710147574543953, "learning_rate": 1.898888888888889e-06, "loss": 0.0015, "num_tokens": 1963348.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6583 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 121.92592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.00010970969742629677, "kl": 9.864568710327148e-06, "learning_rate": 1.8983333333333335e-06, "loss": 0.0, "num_tokens": 1963568.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 6584 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 40.75, "completions/clipped_ratio": 0.0, "completions/max_length": 51.0, "completions/max_terminated_length": 51.0, "completions/mean_length": 40.75, "completions/mean_terminated_length": 40.75, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 121.94444444444444, "frac_reward_zero_std": 0.0, "grad_norm": 1.8761250972747803, "kl": 0.10061680525541306, "learning_rate": 1.8977777777777779e-06, "loss": 0.0073, "num_tokens": 1963951.0, "reward": 4.0, "reward_std": 4.041451930999756, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 4.041451930999756, "step": 6585 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.25, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 17.25, "completions/mean_terminated_length": 17.25, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 121.96296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.03571518510580063, "kl": 0.008891834877431393, "learning_rate": 1.8972222222222222e-06, "loss": 0.0004, "num_tokens": 1964260.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6586 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.75, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 4.75, "completions/mean_terminated_length": 4.75, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 121.98148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.044207263737916946, "kl": 0.02818226907402277, "learning_rate": 1.896666666666667e-06, "loss": 0.0014, "num_tokens": 1964479.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 6587 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 122.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.08974740654230118, "kl": 0.007250359980389476, "learning_rate": 1.8961111111111114e-06, "loss": 0.0004, "num_tokens": 1964747.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6588 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 32.0, "completions/max_terminated_length": 32.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 122.01851851851852, "frac_reward_zero_std": 0.0, "grad_norm": 4.921267509460449, "kl": 0.02455890364944935, "learning_rate": 1.8955555555555557e-06, "loss": 0.296, "num_tokens": 1965031.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 6589 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 122.03703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.0038552992045879364, "kl": 0.010447412729263306, "learning_rate": 1.895e-06, "loss": 0.0005, "num_tokens": 1965267.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 6590 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 122.05555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.004565917421132326, "kl": 0.0038652466610074043, "learning_rate": 1.8944444444444446e-06, "loss": 0.0002, "num_tokens": 1965531.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6591 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 122.07407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.03334076702594757, "kl": 0.003623455762863159, "learning_rate": 1.893888888888889e-06, "loss": 0.0002, "num_tokens": 1965823.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6592 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 39.75, "completions/clipped_ratio": 0.0, "completions/max_length": 45.0, "completions/max_terminated_length": 45.0, "completions/mean_length": 39.75, "completions/mean_terminated_length": 39.75, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 122.0925925925926, "frac_reward_zero_std": 0.0, "grad_norm": 2.9947996139526367, "kl": 0.20124412328004837, "learning_rate": 1.8933333333333333e-06, "loss": -0.1217, "num_tokens": 1966250.0, "reward": 4.5, "reward_std": 4.041451930999756, "rewards/reward_combined/mean": 4.5, "rewards/reward_combined/std": 4.041451930999756, "step": 6593 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 122.11111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.0194789320230484, "kl": 0.012109816074371338, "learning_rate": 1.892777777777778e-06, "loss": 0.0006, "num_tokens": 1966562.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 6594 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 36.75, "completions/clipped_ratio": 0.0, "completions/max_length": 40.0, "completions/max_terminated_length": 40.0, "completions/mean_length": 36.75, "completions/mean_terminated_length": 36.75, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 122.12962962962963, "frac_reward_zero_std": 0.0, "grad_norm": 2.3385567665100098, "kl": 0.09086649864912033, "learning_rate": 1.8922222222222225e-06, "loss": 0.0445, "num_tokens": 1966937.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 6595 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 5.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 5.0, "completions/mean_terminated_length": 5.0, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 122.14814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.004390359856188297, "kl": 0.00035235284303780645, "learning_rate": 1.8916666666666668e-06, "loss": 0.0, "num_tokens": 1967157.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 6596 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 16.5, "completions/mean_terminated_length": 16.5, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 122.16666666666667, "frac_reward_zero_std": 1.0, "grad_norm": 0.05296225845813751, "kl": 0.0036581556778401136, "learning_rate": 1.8911111111111114e-06, "loss": 0.0002, "num_tokens": 1967483.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6597 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 122.18518518518519, "frac_reward_zero_std": 1.0, "grad_norm": 0.0048139868304133415, "kl": 0.0002743750810623169, "learning_rate": 1.8905555555555558e-06, "loss": 0.0, "num_tokens": 1967695.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 6598 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.75, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 13.75, "completions/mean_terminated_length": 13.75, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 122.20370370370371, "frac_reward_zero_std": 1.0, "grad_norm": 0.06874033063650131, "kl": 0.01985176093876362, "learning_rate": 1.8900000000000001e-06, "loss": 0.0011, "num_tokens": 1967970.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6599 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 7.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 7.0, "completions/mean_terminated_length": 7.0, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 122.22222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.014146380126476288, "kl": 0.0008700191974639893, "learning_rate": 1.8894444444444447e-06, "loss": 0.0, "num_tokens": 1968206.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 6600 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.25, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 15.25, "completions/mean_terminated_length": 15.25, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 122.24074074074075, "frac_reward_zero_std": 1.0, "grad_norm": 0.029452821239829063, "kl": 0.23662805557250977, "learning_rate": 1.888888888888889e-06, "loss": 0.0118, "num_tokens": 1968507.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6601 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 18.0, "completions/mean_terminated_length": 18.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 122.25925925925925, "frac_reward_zero_std": 1.0, "grad_norm": 0.006610692013055086, "kl": 0.0029504895210266113, "learning_rate": 1.8883333333333334e-06, "loss": 0.0001, "num_tokens": 1968819.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6602 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 122.27777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.025377707555890083, "kl": 0.009135246276855469, "learning_rate": 1.8877777777777777e-06, "loss": 0.0005, "num_tokens": 1969107.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6603 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.75, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.75, "completions/mean_terminated_length": 14.75, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 122.29629629629629, "frac_reward_zero_std": 1.0, "grad_norm": 0.05212317034602165, "kl": 0.028674963861703873, "learning_rate": 1.8872222222222225e-06, "loss": 0.0015, "num_tokens": 1969382.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6604 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.25, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 9.25, "completions/mean_terminated_length": 9.25, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 122.31481481481481, "frac_reward_zero_std": 1.0, "grad_norm": 0.02637072466313839, "kl": 0.012836234644055367, "learning_rate": 1.8866666666666669e-06, "loss": 0.0006, "num_tokens": 1969643.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6605 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 122.33333333333333, "frac_reward_zero_std": 1.0, "grad_norm": 0.009109367616474628, "kl": 0.0009904690086841583, "learning_rate": 1.8861111111111112e-06, "loss": 0.0, "num_tokens": 1969887.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 6606 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 10.5, "completions/mean_terminated_length": 10.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 122.35185185185185, "frac_reward_zero_std": 0.0, "grad_norm": 3.7846505641937256, "kl": 0.005357112968340516, "learning_rate": 1.8855555555555558e-06, "loss": -0.004, "num_tokens": 1970153.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 6607 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.25, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 15.25, "completions/mean_terminated_length": 15.25, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 122.37037037037037, "frac_reward_zero_std": 0.0, "grad_norm": 1.8013527393341064, "kl": 0.2079295963048935, "learning_rate": 1.8850000000000002e-06, "loss": -0.0422, "num_tokens": 1970434.0, "reward": 7.875, "reward_std": 0.25, "rewards/reward_combined/mean": 7.875, "rewards/reward_combined/std": 0.25, "step": 6608 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.25, "completions/clipped_ratio": 0.0, "completions/max_length": 30.0, "completions/max_terminated_length": 30.0, "completions/mean_length": 23.25, "completions/mean_terminated_length": 23.25, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 122.38888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.12726548314094543, "kl": 0.07423103973269463, "learning_rate": 1.8844444444444445e-06, "loss": 0.0038, "num_tokens": 1970755.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6609 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 122.4074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.013710806146264076, "kl": 0.002015717269387096, "learning_rate": 1.883888888888889e-06, "loss": 0.0001, "num_tokens": 1971025.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6610 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 122.42592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.03182021901011467, "kl": 0.006248852238059044, "learning_rate": 1.8833333333333334e-06, "loss": 0.0003, "num_tokens": 1971309.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6611 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 122.44444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.0001165572612080723, "kl": 1.0579824447631836e-05, "learning_rate": 1.8827777777777778e-06, "loss": 0.0, "num_tokens": 1971529.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 6612 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.0, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 22.0, "completions/mean_terminated_length": 22.0, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 122.46296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.15759626030921936, "kl": 0.03433808870613575, "learning_rate": 1.8822222222222226e-06, "loss": 0.0018, "num_tokens": 1971861.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6613 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.5, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 19.5, "completions/mean_terminated_length": 19.5, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 122.48148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.04050665721297264, "kl": 0.20197156025096774, "learning_rate": 1.881666666666667e-06, "loss": 0.0094, "num_tokens": 1972187.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6614 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 29.25, "completions/clipped_ratio": 0.0, "completions/max_length": 35.0, "completions/max_terminated_length": 35.0, "completions/mean_length": 29.25, "completions/mean_terminated_length": 29.25, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 122.5, "frac_reward_zero_std": 0.0, "grad_norm": 1.3559938669204712, "kl": 0.058913227170705795, "learning_rate": 1.8811111111111113e-06, "loss": -0.1731, "num_tokens": 1972556.0, "reward": 7.75, "reward_std": 0.28867512941360474, "rewards/reward_combined/mean": 7.75, "rewards/reward_combined/std": 0.28867512941360474, "step": 6615 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.25, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 20.25, "completions/mean_terminated_length": 20.25, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 122.51851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.0629178062081337, "kl": 0.012283597607165575, "learning_rate": 1.8805555555555556e-06, "loss": 0.0006, "num_tokens": 1972885.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 6616 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 33.25, "completions/clipped_ratio": 0.0, "completions/max_length": 48.0, "completions/max_terminated_length": 48.0, "completions/mean_length": 33.25, "completions/mean_terminated_length": 33.25, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 122.53703703703704, "frac_reward_zero_std": 0.0, "grad_norm": 2.025740623474121, "kl": 0.11841679736971855, "learning_rate": 1.8800000000000002e-06, "loss": -0.0227, "num_tokens": 1973254.0, "reward": 6.5, "reward_std": 2.345207929611206, "rewards/reward_combined/mean": 6.5, "rewards/reward_combined/std": 2.345207929611206, "step": 6617 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.25, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 17.25, "completions/mean_terminated_length": 17.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 122.55555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.010611794888973236, "kl": 0.17232748121023178, "learning_rate": 1.8794444444444446e-06, "loss": 0.0086, "num_tokens": 1973563.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6618 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.5, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 2.5, "completions/mean_terminated_length": 2.5, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 122.57407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.05587904527783394, "kl": 0.004701100289821625, "learning_rate": 1.878888888888889e-06, "loss": 0.0002, "num_tokens": 1973769.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 6619 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.75, "completions/clipped_ratio": 0.0, "completions/max_length": 28.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 20.75, "completions/mean_terminated_length": 20.75, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 122.5925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.061024174094200134, "kl": 0.07612895965576172, "learning_rate": 1.8783333333333335e-06, "loss": 0.0037, "num_tokens": 1974072.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 6620 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.5, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 20.5, "completions/mean_terminated_length": 20.5, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 122.61111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.04889427497982979, "kl": 0.02171096857637167, "learning_rate": 1.8777777777777778e-06, "loss": 0.0011, "num_tokens": 1974406.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6621 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.75, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 13.75, "completions/mean_terminated_length": 13.75, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 122.62962962962963, "frac_reward_zero_std": 0.0, "grad_norm": 3.4822144508361816, "kl": 0.042744599748402834, "learning_rate": 1.8772222222222224e-06, "loss": -0.0181, "num_tokens": 1974685.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 6622 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 122.64814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.016110891476273537, "kl": 0.0033987981732934713, "learning_rate": 1.876666666666667e-06, "loss": 0.0002, "num_tokens": 1974987.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6623 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 122.66666666666667, "frac_reward_zero_std": 1.0, "grad_norm": 0.013269777409732342, "kl": 0.009788049850612879, "learning_rate": 1.8761111111111113e-06, "loss": 0.0005, "num_tokens": 1975259.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6624 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 122.68518518518519, "frac_reward_zero_std": 1.0, "grad_norm": 0.024631422013044357, "kl": 0.0037627421552315354, "learning_rate": 1.8755555555555557e-06, "loss": 0.0002, "num_tokens": 1975557.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6625 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.25, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 12.25, "completions/mean_terminated_length": 12.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 122.70370370370371, "frac_reward_zero_std": 1.0, "grad_norm": 0.04983077943325043, "kl": 0.0056130001321434975, "learning_rate": 1.8750000000000003e-06, "loss": 0.0003, "num_tokens": 1975830.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6626 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 45.0, "completions/clipped_ratio": 0.0, "completions/max_length": 51.0, "completions/max_terminated_length": 51.0, "completions/mean_length": 45.0, "completions/mean_terminated_length": 45.0, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 122.72222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.023706046864390373, "kl": 0.04484688676893711, "learning_rate": 1.8744444444444446e-06, "loss": 0.0022, "num_tokens": 1976290.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 6627 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 51.5, "completions/clipped_ratio": 0.0, "completions/max_length": 75.0, "completions/max_terminated_length": 75.0, "completions/mean_length": 51.5, "completions/mean_terminated_length": 51.5, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "epoch": 122.74074074074075, "frac_reward_zero_std": 0.0, "grad_norm": 2.3254554271698, "kl": 0.09070155397057533, "learning_rate": 1.873888888888889e-06, "loss": 0.0573, "num_tokens": 1976720.0, "reward": 4.625, "reward_std": 3.4731109142303467, "rewards/reward_combined/mean": 4.625, "rewards/reward_combined/std": 3.4731109142303467, "step": 6628 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 122.75925925925925, "frac_reward_zero_std": 1.0, "grad_norm": 0.00778210861608386, "kl": 0.0015342533588409424, "learning_rate": 1.8733333333333333e-06, "loss": 0.0001, "num_tokens": 1976932.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 6629 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 31.0, "completions/clipped_ratio": 0.0, "completions/max_length": 36.0, "completions/max_terminated_length": 36.0, "completions/mean_length": 31.0, "completions/mean_terminated_length": 31.0, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 122.77777777777777, "frac_reward_zero_std": 0.0, "grad_norm": 3.2180583477020264, "kl": 0.1490049585700035, "learning_rate": 1.8727777777777779e-06, "loss": 0.043, "num_tokens": 1977280.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 6630 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 122.79629629629629, "frac_reward_zero_std": 1.0, "grad_norm": 0.014559843577444553, "kl": 0.028116188943386078, "learning_rate": 1.8722222222222225e-06, "loss": 0.0014, "num_tokens": 1977496.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 6631 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.0, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 22.0, "completions/mean_terminated_length": 22.0, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 122.81481481481481, "frac_reward_zero_std": 1.0, "grad_norm": 0.039955344051122665, "kl": 0.021780118346214294, "learning_rate": 1.8716666666666668e-06, "loss": 0.0011, "num_tokens": 1977812.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6632 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 122.83333333333333, "frac_reward_zero_std": 1.0, "grad_norm": 0.10078075528144836, "kl": 0.023699565790593624, "learning_rate": 1.8711111111111114e-06, "loss": 0.0012, "num_tokens": 1978100.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6633 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 122.85185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.017275376245379448, "kl": 0.00041978360241046175, "learning_rate": 1.8705555555555557e-06, "loss": 0.0, "num_tokens": 1978356.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 6634 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 122.87037037037037, "frac_reward_zero_std": 0.0, "grad_norm": 3.3601081371307373, "kl": 0.10883800219744444, "learning_rate": 1.87e-06, "loss": 0.1647, "num_tokens": 1978634.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 6635 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 18.0, "completions/mean_terminated_length": 18.0, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 122.88888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.06310009956359863, "kl": 0.018722625449299812, "learning_rate": 1.8694444444444447e-06, "loss": 0.0009, "num_tokens": 1978934.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6636 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 122.9074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.06565100699663162, "kl": 0.006445693084970117, "learning_rate": 1.868888888888889e-06, "loss": 0.0003, "num_tokens": 1979190.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6637 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 122.92592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.005780225154012442, "kl": 0.2816808968782425, "learning_rate": 1.8683333333333334e-06, "loss": 0.0141, "num_tokens": 1979478.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6638 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 27.25, "completions/clipped_ratio": 0.0, "completions/max_length": 30.0, "completions/max_terminated_length": 30.0, "completions/mean_length": 27.25, "completions/mean_terminated_length": 27.25, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 122.94444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.09082350134849548, "kl": 0.11514962837100029, "learning_rate": 1.8677777777777777e-06, "loss": 0.0058, "num_tokens": 1979803.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6639 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 122.96296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.0053114634938538074, "kl": 0.0006694704061374068, "learning_rate": 1.8672222222222225e-06, "loss": 0.0, "num_tokens": 1980063.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 6640 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 41.0, "completions/clipped_ratio": 0.0, "completions/max_length": 52.0, "completions/max_terminated_length": 52.0, "completions/mean_length": 41.0, "completions/mean_terminated_length": 41.0, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "epoch": 122.98148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.25009310245513916, "kl": 0.0634799562394619, "learning_rate": 1.8666666666666669e-06, "loss": 0.0031, "num_tokens": 1980447.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6641 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.5, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 20.5, "completions/mean_terminated_length": 20.5, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 123.0, "frac_reward_zero_std": 0.0, "grad_norm": 5.136460781097412, "kl": 0.02855243720114231, "learning_rate": 1.8661111111111112e-06, "loss": 0.0952, "num_tokens": 1980749.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 6642 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 33.0, "completions/clipped_ratio": 0.0, "completions/max_length": 42.0, "completions/max_terminated_length": 42.0, "completions/mean_length": 33.0, "completions/mean_terminated_length": 33.0, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 123.01851851851852, "frac_reward_zero_std": 0.0, "grad_norm": 1.4752219915390015, "kl": 0.3279092609882355, "learning_rate": 1.8655555555555558e-06, "loss": -0.0821, "num_tokens": 1981117.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 6643 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 123.03703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.10392206162214279, "kl": 0.03968023136258125, "learning_rate": 1.8650000000000001e-06, "loss": 0.0021, "num_tokens": 1981389.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6644 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 123.05555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.02812691032886505, "kl": 0.0005426406860351562, "learning_rate": 1.8644444444444445e-06, "loss": 0.0, "num_tokens": 1981601.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 6645 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.5, "completions/clipped_ratio": 0.0, "completions/max_length": 25.0, "completions/max_terminated_length": 25.0, "completions/mean_length": 19.5, "completions/mean_terminated_length": 19.5, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 123.07407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.0560215599834919, "kl": 0.06642736867070198, "learning_rate": 1.863888888888889e-06, "loss": 0.0033, "num_tokens": 1981899.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 6646 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 123.0925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.004409096669405699, "kl": 0.010327324271202087, "learning_rate": 1.8633333333333334e-06, "loss": 0.0005, "num_tokens": 1982135.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 6647 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 123.11111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.07510194927453995, "kl": 0.17334836721420288, "learning_rate": 1.8627777777777778e-06, "loss": 0.0087, "num_tokens": 1982419.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6648 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.25, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 17.25, "completions/mean_terminated_length": 17.25, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 123.12962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.0337115116417408, "kl": 0.008102408261038363, "learning_rate": 1.8622222222222226e-06, "loss": 0.0004, "num_tokens": 1982728.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6649 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.25, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.25, "completions/mean_terminated_length": 15.25, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 123.14814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.010902820155024529, "kl": 0.0016487836255691946, "learning_rate": 1.861666666666667e-06, "loss": 0.0001, "num_tokens": 1983049.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6650 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 29.25, "completions/clipped_ratio": 0.0, "completions/max_length": 38.0, "completions/max_terminated_length": 38.0, "completions/mean_length": 29.25, "completions/mean_terminated_length": 29.25, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 123.16666666666667, "frac_reward_zero_std": 0.0, "grad_norm": 1.5838677883148193, "kl": 0.10151666775345802, "learning_rate": 1.8611111111111113e-06, "loss": -0.2132, "num_tokens": 1983418.0, "reward": 7.75, "reward_std": 0.28867512941360474, "rewards/reward_combined/mean": 7.75, "rewards/reward_combined/std": 0.28867512941360474, "step": 6651 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 18.0, "completions/mean_terminated_length": 18.0, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 123.18518518518519, "frac_reward_zero_std": 1.0, "grad_norm": 0.05349716916680336, "kl": 0.012978187762200832, "learning_rate": 1.8605555555555556e-06, "loss": 0.0007, "num_tokens": 1983738.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6652 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.75, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 17.75, "completions/mean_terminated_length": 17.75, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 123.20370370370371, "frac_reward_zero_std": 1.0, "grad_norm": 0.04607090726494789, "kl": 0.011022207327187061, "learning_rate": 1.8600000000000002e-06, "loss": 0.0006, "num_tokens": 1984033.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6653 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.0, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 23.0, "completions/mean_terminated_length": 23.0, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 123.22222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.2910459637641907, "kl": 0.05800641141831875, "learning_rate": 1.8594444444444445e-06, "loss": 0.003, "num_tokens": 1984353.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6654 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 76.25, "completions/clipped_ratio": 0.25, "completions/max_length": 256.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 76.25, "completions/mean_terminated_length": 16.33333396911621, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 123.24074074074075, "frac_reward_zero_std": 0.0, "grad_norm": 3.5413496494293213, "kl": 0.06960074976086617, "learning_rate": 1.858888888888889e-06, "loss": 0.2885, "num_tokens": 1984878.0, "reward": 3.424999952316284, "reward_std": 5.569784164428711, "rewards/reward_combined/mean": 3.424999952316284, "rewards/reward_combined/std": 5.569784641265869, "step": 6655 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 123.25925925925925, "frac_reward_zero_std": 1.0, "grad_norm": 0.025598231703042984, "kl": 0.009159432724118233, "learning_rate": 1.8583333333333335e-06, "loss": 0.0005, "num_tokens": 1985190.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 6656 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 41.5, "completions/clipped_ratio": 0.0, "completions/max_length": 51.0, "completions/max_terminated_length": 51.0, "completions/mean_length": 41.5, "completions/mean_terminated_length": 41.5, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 123.27777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.04195448011159897, "kl": 0.046219056472182274, "learning_rate": 1.8577777777777778e-06, "loss": 0.0023, "num_tokens": 1985576.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6657 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.25, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 15.25, "completions/mean_terminated_length": 15.25, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 123.29629629629629, "frac_reward_zero_std": 1.0, "grad_norm": 0.06386665254831314, "kl": 0.24032461643218994, "learning_rate": 1.8572222222222224e-06, "loss": 0.0122, "num_tokens": 1985861.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6658 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 50.75, "completions/clipped_ratio": 0.0, "completions/max_length": 65.0, "completions/max_terminated_length": 65.0, "completions/mean_length": 50.75, "completions/mean_terminated_length": 50.75, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 123.31481481481481, "frac_reward_zero_std": 0.0, "grad_norm": 1.5649887323379517, "kl": 0.04379189759492874, "learning_rate": 1.856666666666667e-06, "loss": 0.1052, "num_tokens": 1986344.0, "reward": 2.375, "reward_std": 1.25, "rewards/reward_combined/mean": 2.375, "rewards/reward_combined/std": 1.25, "step": 6659 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.5, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 2.5, "completions/mean_terminated_length": 2.5, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 123.33333333333333, "frac_reward_zero_std": 1.0, "grad_norm": 0.05054464191198349, "kl": 0.004925340414047241, "learning_rate": 1.8561111111111113e-06, "loss": 0.0002, "num_tokens": 1986550.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 6660 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 123.35185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.028278756886720657, "kl": 0.002145069927792065, "learning_rate": 1.8555555555555557e-06, "loss": 0.0001, "num_tokens": 1986806.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 6661 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.25, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 17.25, "completions/mean_terminated_length": 17.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 123.37037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.05497903749346733, "kl": 0.17599192261695862, "learning_rate": 1.8550000000000002e-06, "loss": 0.0088, "num_tokens": 1987115.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6662 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 73.75, "completions/clipped_ratio": 0.25, "completions/max_length": 256.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 73.75, "completions/mean_terminated_length": 13.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 123.38888888888889, "frac_reward_zero_std": 0.0, "grad_norm": 1.9847571849822998, "kl": 0.05518887331709266, "learning_rate": 1.8544444444444446e-06, "loss": 0.5032, "num_tokens": 1987622.0, "reward": 3.375, "reward_std": 5.647639751434326, "rewards/reward_combined/mean": 3.375, "rewards/reward_combined/std": 5.647639751434326, "step": 6663 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 26.75, "completions/clipped_ratio": 0.0, "completions/max_length": 29.0, "completions/max_terminated_length": 29.0, "completions/mean_length": 26.75, "completions/mean_terminated_length": 26.75, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 123.4074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.08845123648643494, "kl": 0.12087446823716164, "learning_rate": 1.853888888888889e-06, "loss": 0.0061, "num_tokens": 1987945.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6664 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 123.42592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.025837156921625137, "kl": 0.2407861202955246, "learning_rate": 1.8533333333333333e-06, "loss": 0.012, "num_tokens": 1988245.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6665 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.25, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.25, "completions/mean_terminated_length": 13.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 123.44444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.04656577110290527, "kl": 0.005943537689745426, "learning_rate": 1.852777777777778e-06, "loss": 0.0003, "num_tokens": 1988522.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6666 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 123.46296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.0029225298203527927, "kl": 0.00042084307642653584, "learning_rate": 1.8522222222222224e-06, "loss": 0.0, "num_tokens": 1988790.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6667 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.75, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 12.75, "completions/mean_terminated_length": 12.75, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 123.48148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.0773555338382721, "kl": 0.04087892104871571, "learning_rate": 1.8516666666666668e-06, "loss": 0.0025, "num_tokens": 1989093.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6668 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 123.5, "frac_reward_zero_std": 1.0, "grad_norm": 0.02582254633307457, "kl": 0.0034212273894809186, "learning_rate": 1.8511111111111114e-06, "loss": 0.0002, "num_tokens": 1989383.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6669 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 35.5, "completions/clipped_ratio": 0.0, "completions/max_length": 38.0, "completions/max_terminated_length": 38.0, "completions/mean_length": 35.5, "completions/mean_terminated_length": 35.5, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 123.51851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.08714812248945236, "kl": 0.07083503156900406, "learning_rate": 1.8505555555555557e-06, "loss": 0.0035, "num_tokens": 1989753.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6670 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 10.5, "completions/mean_terminated_length": 10.5, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 123.53703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.007019422948360443, "kl": 0.0034105145605280995, "learning_rate": 1.85e-06, "loss": 0.0002, "num_tokens": 1990019.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6671 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.014285714365541935, "clip_ratio/low_min": 0.014285714365541935, "clip_ratio/region_mean": 0.014285714365541935, "completion_length": 23.0, "completions/clipped_ratio": 0.0, "completions/max_length": 33.0, "completions/max_terminated_length": 33.0, "completions/mean_length": 23.0, "completions/mean_terminated_length": 23.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 123.55555555555556, "frac_reward_zero_std": 0.0, "grad_norm": 3.6279568672180176, "kl": 0.2936725467443466, "learning_rate": 1.8494444444444446e-06, "loss": 0.0291, "num_tokens": 1990339.0, "reward": 5.125, "reward_std": 3.4731109142303467, "rewards/reward_combined/mean": 5.125, "rewards/reward_combined/std": 3.4731109142303467, "step": 6672 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.25, "completions/clipped_ratio": 0.0, "completions/max_length": 27.0, "completions/max_terminated_length": 27.0, "completions/mean_length": 21.25, "completions/mean_terminated_length": 21.25, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 123.57407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 1.1959308385849, "kl": 0.1543426951393485, "learning_rate": 1.848888888888889e-06, "loss": 0.0066, "num_tokens": 1990672.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 6673 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.5, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 20.5, "completions/mean_terminated_length": 20.5, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 123.5925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.04868542030453682, "kl": 0.022923484444618225, "learning_rate": 1.8483333333333334e-06, "loss": 0.0011, "num_tokens": 1991006.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6674 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 123.61111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.11467229574918747, "kl": 0.03419129364192486, "learning_rate": 1.8477777777777781e-06, "loss": 0.0017, "num_tokens": 1991296.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6675 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 123.62962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.0485171377658844, "kl": 0.005515407770872116, "learning_rate": 1.8472222222222225e-06, "loss": 0.0003, "num_tokens": 1991556.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 6676 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 123.64814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.00961950235068798, "kl": 0.0036433174391277134, "learning_rate": 1.8466666666666668e-06, "loss": 0.0002, "num_tokens": 1991860.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6677 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 123.66666666666667, "frac_reward_zero_std": 1.0, "grad_norm": 0.07677372545003891, "kl": 0.010104158543981612, "learning_rate": 1.8461111111111112e-06, "loss": 0.0005, "num_tokens": 1992120.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 6678 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.5, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 8.5, "completions/mean_terminated_length": 8.5, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 123.68518518518519, "frac_reward_zero_std": 0.0, "grad_norm": 4.194064617156982, "kl": 0.08962385728955269, "learning_rate": 1.8455555555555558e-06, "loss": 0.0536, "num_tokens": 1992366.0, "reward": 2.5, "reward_std": 1.7320507764816284, "rewards/reward_combined/mean": 2.5, "rewards/reward_combined/std": 1.7320507764816284, "step": 6679 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.0, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 22.0, "completions/mean_terminated_length": 22.0, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 123.70370370370371, "frac_reward_zero_std": 1.0, "grad_norm": 0.019461870193481445, "kl": 0.008692351169884205, "learning_rate": 1.8450000000000001e-06, "loss": 0.0004, "num_tokens": 1992698.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6680 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 123.72222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.01477635744959116, "kl": 0.015273160301148891, "learning_rate": 1.8444444444444445e-06, "loss": 0.0008, "num_tokens": 1992958.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6681 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 28.75, "completions/clipped_ratio": 0.0, "completions/max_length": 35.0, "completions/max_terminated_length": 35.0, "completions/mean_length": 28.75, "completions/mean_terminated_length": 28.75, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 123.74074074074075, "frac_reward_zero_std": 1.0, "grad_norm": 0.19146095216274261, "kl": 0.16447879374027252, "learning_rate": 1.843888888888889e-06, "loss": 0.0083, "num_tokens": 1993297.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6682 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 42.75, "completions/clipped_ratio": 0.0, "completions/max_length": 48.0, "completions/max_terminated_length": 48.0, "completions/mean_length": 42.75, "completions/mean_terminated_length": 42.75, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 123.75925925925925, "frac_reward_zero_std": 1.0, "grad_norm": 0.34815284609794617, "kl": 0.1392939779907465, "learning_rate": 1.8433333333333334e-06, "loss": 0.007, "num_tokens": 1993692.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6683 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 123.77777777777777, "frac_reward_zero_std": 0.0, "grad_norm": 10.536606788635254, "kl": 0.007180021144449711, "learning_rate": 1.842777777777778e-06, "loss": 0.0367, "num_tokens": 1993978.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 6684 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.75, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 16.75, "completions/mean_terminated_length": 16.75, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 123.79629629629629, "frac_reward_zero_std": 0.0, "grad_norm": 1.264427900314331, "kl": 0.5773916440084577, "learning_rate": 1.8422222222222225e-06, "loss": -0.0333, "num_tokens": 1994273.0, "reward": 7.25, "reward_std": 1.5, "rewards/reward_combined/mean": 7.25, "rewards/reward_combined/std": 1.5, "step": 6685 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.25, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.25, "completions/mean_terminated_length": 11.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 123.81481481481481, "frac_reward_zero_std": 1.0, "grad_norm": 0.07679658383131027, "kl": 0.01308439765125513, "learning_rate": 1.8416666666666669e-06, "loss": 0.0007, "num_tokens": 1994546.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6686 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.75, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.75, "completions/mean_terminated_length": 6.75, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 123.83333333333333, "frac_reward_zero_std": 1.0, "grad_norm": 0.01784813404083252, "kl": 0.0014159125566948205, "learning_rate": 1.8411111111111112e-06, "loss": 0.0001, "num_tokens": 1994781.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 6687 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 123.85185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.06931953877210617, "kl": 0.031906431540846825, "learning_rate": 1.8405555555555556e-06, "loss": 0.0016, "num_tokens": 1995057.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6688 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 123.87037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.007615453563630581, "kl": 0.0016071796417236328, "learning_rate": 1.8400000000000002e-06, "loss": 0.0001, "num_tokens": 1995269.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 6689 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 37.5, "completions/clipped_ratio": 0.0, "completions/max_length": 45.0, "completions/max_terminated_length": 45.0, "completions/mean_length": 37.5, "completions/mean_terminated_length": 37.5, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 123.88888888888889, "frac_reward_zero_std": 0.0, "grad_norm": 2.007173538208008, "kl": 0.10223019123077393, "learning_rate": 1.8394444444444445e-06, "loss": -0.0759, "num_tokens": 1995687.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 6690 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.75, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.75, "completions/mean_terminated_length": 14.75, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 123.9074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.03913802653551102, "kl": 0.0069810315035283566, "learning_rate": 1.8388888888888889e-06, "loss": 0.0003, "num_tokens": 1995978.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6691 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.75, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 6.75, "completions/mean_terminated_length": 6.75, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 123.92592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.08417047560214996, "kl": 0.030480634421110153, "learning_rate": 1.8383333333333334e-06, "loss": 0.0016, "num_tokens": 1996205.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 6692 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 5.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 5.0, "completions/mean_terminated_length": 5.0, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 123.94444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.005285702645778656, "kl": 0.0007913470035418868, "learning_rate": 1.837777777777778e-06, "loss": 0.0, "num_tokens": 1996425.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 6693 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 123.96296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.00011847836140077561, "kl": 1.0341405868530273e-05, "learning_rate": 1.8372222222222224e-06, "loss": 0.0, "num_tokens": 1996645.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 6694 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.25, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 18.25, "completions/mean_terminated_length": 18.25, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 123.98148148148148, "frac_reward_zero_std": 0.0, "grad_norm": 4.644125461578369, "kl": 0.0675185564905405, "learning_rate": 1.836666666666667e-06, "loss": -0.045, "num_tokens": 1996938.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 6695 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 10.5, "completions/mean_terminated_length": 10.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 124.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.07363531738519669, "kl": 0.00787548185326159, "learning_rate": 1.8361111111111113e-06, "loss": 0.0004, "num_tokens": 1997200.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6696 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 29.5, "completions/clipped_ratio": 0.0, "completions/max_length": 40.0, "completions/max_terminated_length": 40.0, "completions/mean_length": 29.5, "completions/mean_terminated_length": 29.5, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 124.01851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.21688975393772125, "kl": 0.033348968252539635, "learning_rate": 1.8355555555555557e-06, "loss": 0.0018, "num_tokens": 1997566.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 6697 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 124.03703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.03582407906651497, "kl": 0.005612015724182129, "learning_rate": 1.8350000000000002e-06, "loss": 0.0003, "num_tokens": 1997810.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 6698 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.75, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 17.75, "completions/mean_terminated_length": 17.75, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 124.05555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.08042564243078232, "kl": 0.017548485193401575, "learning_rate": 1.8344444444444446e-06, "loss": 0.0008, "num_tokens": 1998109.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6699 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 48.0, "completions/clipped_ratio": 0.0, "completions/max_length": 55.0, "completions/max_terminated_length": 55.0, "completions/mean_length": 48.0, "completions/mean_terminated_length": 48.0, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 124.07407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.06802449375391006, "kl": 0.05856997147202492, "learning_rate": 1.833888888888889e-06, "loss": 0.0029, "num_tokens": 1998581.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 6700 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 124.0925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.004097120836377144, "kl": 0.01039646565914154, "learning_rate": 1.8333333333333333e-06, "loss": 0.0005, "num_tokens": 1998817.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 6701 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.0, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 17.0, "completions/mean_terminated_length": 17.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 124.11111111111111, "frac_reward_zero_std": 0.0, "grad_norm": 4.269119739532471, "kl": 0.17165767401456833, "learning_rate": 1.832777777777778e-06, "loss": 0.0681, "num_tokens": 1999105.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 6702 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.5, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 19.5, "completions/mean_terminated_length": 19.5, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 124.12962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 3.6063733100891113, "kl": 0.2546239495277405, "learning_rate": 1.8322222222222224e-06, "loss": 0.0121, "num_tokens": 1999435.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6703 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.5, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 22.5, "completions/mean_terminated_length": 22.5, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 124.14814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.08230458199977875, "kl": 0.020663633942604065, "learning_rate": 1.8316666666666668e-06, "loss": 0.001, "num_tokens": 1999753.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6704 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 35.75, "completions/clipped_ratio": 0.0, "completions/max_length": 45.0, "completions/max_terminated_length": 45.0, "completions/mean_length": 35.75, "completions/mean_terminated_length": 35.75, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 124.16666666666667, "frac_reward_zero_std": 0.0, "grad_norm": 2.886744737625122, "kl": 0.18181107193231583, "learning_rate": 1.8311111111111113e-06, "loss": 0.109, "num_tokens": 2000132.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 6705 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.25, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 12.25, "completions/mean_terminated_length": 12.25, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 124.18518518518519, "frac_reward_zero_std": 0.0, "grad_norm": 5.911078929901123, "kl": 0.05358145263744518, "learning_rate": 1.8305555555555557e-06, "loss": -0.0937, "num_tokens": 2000405.0, "reward": 7.625, "reward_std": 0.25, "rewards/reward_combined/mean": 7.625, "rewards/reward_combined/std": 0.25, "step": 6706 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.5, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 9.5, "completions/mean_terminated_length": 9.5, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 124.20370370370371, "frac_reward_zero_std": 1.0, "grad_norm": 0.024491364136338234, "kl": 0.004032726865261793, "learning_rate": 1.83e-06, "loss": 0.0002, "num_tokens": 2000667.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6707 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.75, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.75, "completions/mean_terminated_length": 14.75, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 124.22222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.007103769574314356, "kl": 0.0017877958598546684, "learning_rate": 1.8294444444444446e-06, "loss": 0.0001, "num_tokens": 2000986.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6708 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.75, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 15.75, "completions/mean_terminated_length": 15.75, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 124.24074074074075, "frac_reward_zero_std": 0.0, "grad_norm": 4.428089141845703, "kl": 0.2862941697239876, "learning_rate": 1.828888888888889e-06, "loss": 0.02, "num_tokens": 2001289.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 6709 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 16.5, "completions/mean_terminated_length": 16.5, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 124.25925925925925, "frac_reward_zero_std": 1.0, "grad_norm": 0.02889305166900158, "kl": 0.008196088252589107, "learning_rate": 1.8283333333333333e-06, "loss": 0.0004, "num_tokens": 2001595.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6710 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.008196720853447914, "clip_ratio/low_min": 0.008196720853447914, "clip_ratio/region_mean": 0.008196720853447914, "completion_length": 26.0, "completions/clipped_ratio": 0.0, "completions/max_length": 31.0, "completions/max_terminated_length": 31.0, "completions/mean_length": 26.0, "completions/mean_terminated_length": 26.0, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 124.27777777777777, "frac_reward_zero_std": 0.0, "grad_norm": 4.693202018737793, "kl": 0.07555172219872475, "learning_rate": 1.8277777777777781e-06, "loss": -0.0059, "num_tokens": 2001927.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 6711 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 41.0, "completions/clipped_ratio": 0.0, "completions/max_length": 52.0, "completions/max_terminated_length": 52.0, "completions/mean_length": 41.0, "completions/mean_terminated_length": 41.0, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 124.29629629629629, "frac_reward_zero_std": 1.0, "grad_norm": 0.03514701873064041, "kl": 0.044624462723731995, "learning_rate": 1.8272222222222225e-06, "loss": 0.0022, "num_tokens": 2002311.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6712 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.5, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 18.5, "completions/mean_terminated_length": 18.5, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 124.31481481481481, "frac_reward_zero_std": 1.0, "grad_norm": 0.17907167971134186, "kl": 0.03271032124757767, "learning_rate": 1.8266666666666668e-06, "loss": 0.0016, "num_tokens": 2002605.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6713 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.25, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 10.25, "completions/mean_terminated_length": 10.25, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 124.33333333333333, "frac_reward_zero_std": 1.0, "grad_norm": 0.03158054128289223, "kl": 0.0013809417869197205, "learning_rate": 1.8261111111111112e-06, "loss": 0.0001, "num_tokens": 2002862.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 6714 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 124.35185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.00013067753752693534, "kl": 1.2010335922241211e-05, "learning_rate": 1.8255555555555557e-06, "loss": 0.0, "num_tokens": 2003082.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 6715 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 124.37037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.04472355917096138, "kl": 0.014005882665514946, "learning_rate": 1.825e-06, "loss": 0.0007, "num_tokens": 2003362.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6716 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 124.38888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.06163159757852554, "kl": 0.022967617958784103, "learning_rate": 1.8244444444444445e-06, "loss": 0.0011, "num_tokens": 2003630.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6717 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 28.75, "completions/clipped_ratio": 0.0, "completions/max_length": 37.0, "completions/max_terminated_length": 37.0, "completions/mean_length": 28.75, "completions/mean_terminated_length": 28.75, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 124.4074074074074, "frac_reward_zero_std": 0.0, "grad_norm": 1.957392692565918, "kl": 0.17030832916498184, "learning_rate": 1.823888888888889e-06, "loss": 0.0616, "num_tokens": 2003961.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 6718 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.75, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 19.75, "completions/mean_terminated_length": 19.75, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 124.42592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.03373340889811516, "kl": 0.013833234552294016, "learning_rate": 1.8233333333333334e-06, "loss": 0.0006, "num_tokens": 2004288.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6719 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 124.44444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.016478409990668297, "kl": 0.028451502323150635, "learning_rate": 1.822777777777778e-06, "loss": 0.0014, "num_tokens": 2004504.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 6720 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 17.0, "completions/mean_terminated_length": 17.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 124.46296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.010306412354111671, "kl": 0.1746811494231224, "learning_rate": 1.8222222222222225e-06, "loss": 0.0087, "num_tokens": 2004812.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6721 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 9.5, "completions/mean_terminated_length": 9.5, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 124.48148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.07268873602151871, "kl": 0.006632425356656313, "learning_rate": 1.8216666666666669e-06, "loss": 0.0003, "num_tokens": 2005070.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6722 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.5, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.5, "completions/mean_terminated_length": 6.5, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 124.5, "frac_reward_zero_std": 1.0, "grad_norm": 0.055672161281108856, "kl": 0.00217815104406327, "learning_rate": 1.8211111111111112e-06, "loss": 0.0001, "num_tokens": 2005304.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 6723 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 38.0, "completions/clipped_ratio": 0.0, "completions/max_length": 47.0, "completions/max_terminated_length": 47.0, "completions/mean_length": 38.0, "completions/mean_terminated_length": 38.0, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 124.51851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 1.500345230102539, "kl": 0.30989227443933487, "learning_rate": 1.8205555555555556e-06, "loss": 0.0153, "num_tokens": 2005680.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6724 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 124.53703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.016046343371272087, "kl": 0.005847167456522584, "learning_rate": 1.8200000000000002e-06, "loss": 0.0003, "num_tokens": 2005964.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6725 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.75, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 14.75, "completions/mean_terminated_length": 14.75, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 124.55555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.19281768798828125, "kl": 0.04596136510372162, "learning_rate": 1.8194444444444445e-06, "loss": 0.0024, "num_tokens": 2006243.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6726 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 124.57407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.0034315555822104216, "kl": 0.2822279781103134, "learning_rate": 1.8188888888888889e-06, "loss": 0.0141, "num_tokens": 2006531.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6727 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.25, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.25, "completions/mean_terminated_length": 13.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 124.5925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.08759946376085281, "kl": 0.009459356078878045, "learning_rate": 1.8183333333333336e-06, "loss": 0.0005, "num_tokens": 2006808.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6728 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 124.61111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.29665207862854004, "kl": 0.032065303646959364, "learning_rate": 1.817777777777778e-06, "loss": 0.002, "num_tokens": 2007076.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 6729 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 124.62962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.009939088486135006, "kl": 0.001909799873828888, "learning_rate": 1.8172222222222224e-06, "loss": 0.0001, "num_tokens": 2007284.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 6730 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.0, "completions/max_length": 35.0, "completions/max_terminated_length": 35.0, "completions/mean_length": 32.0, "completions/mean_terminated_length": 32.0, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 124.64814814814815, "frac_reward_zero_std": 0.0, "grad_norm": 2.1148409843444824, "kl": 0.24385599046945572, "learning_rate": 1.816666666666667e-06, "loss": 0.0244, "num_tokens": 2007636.0, "reward": 5.125, "reward_std": 3.4731109142303467, "rewards/reward_combined/mean": 5.125, "rewards/reward_combined/std": 3.4731109142303467, "step": 6731 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.5, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 18.5, "completions/mean_terminated_length": 18.5, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 124.66666666666667, "frac_reward_zero_std": 1.0, "grad_norm": 0.21779672801494598, "kl": 0.030477105174213648, "learning_rate": 1.8161111111111113e-06, "loss": 0.0016, "num_tokens": 2007950.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6732 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.25, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 13.25, "completions/mean_terminated_length": 13.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 124.68518518518519, "frac_reward_zero_std": 1.0, "grad_norm": 0.0839671865105629, "kl": 0.025084815453737974, "learning_rate": 1.8155555555555556e-06, "loss": 0.0013, "num_tokens": 2008235.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6733 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.25, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 19.25, "completions/mean_terminated_length": 19.25, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 124.70370370370371, "frac_reward_zero_std": 1.0, "grad_norm": 0.09273037314414978, "kl": 0.020035143941640854, "learning_rate": 1.8150000000000002e-06, "loss": 0.001, "num_tokens": 2008556.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6734 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.25, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.25, "completions/mean_terminated_length": 13.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 124.72222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.18739120662212372, "kl": 0.03608038276433945, "learning_rate": 1.8144444444444446e-06, "loss": 0.0018, "num_tokens": 2008825.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6735 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.75, "completions/clipped_ratio": 0.0, "completions/max_length": 29.0, "completions/max_terminated_length": 29.0, "completions/mean_length": 8.75, "completions/mean_terminated_length": 8.75, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 124.74074074074075, "frac_reward_zero_std": 1.0, "grad_norm": 0.20995917916297913, "kl": 0.017016611993312836, "learning_rate": 1.813888888888889e-06, "loss": 0.0015, "num_tokens": 2009064.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 6736 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.75, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 13.75, "completions/mean_terminated_length": 13.75, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 124.75925925925925, "frac_reward_zero_std": 1.0, "grad_norm": 0.08771708607673645, "kl": 0.020227906294167042, "learning_rate": 1.8133333333333337e-06, "loss": 0.0011, "num_tokens": 2009351.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6737 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 124.77777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.19003821909427643, "kl": 0.02048892481252551, "learning_rate": 1.812777777777778e-06, "loss": 0.001, "num_tokens": 2009619.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6738 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 124.79629629629629, "frac_reward_zero_std": 1.0, "grad_norm": 0.0450611487030983, "kl": 0.010043293237686157, "learning_rate": 1.8122222222222224e-06, "loss": 0.0005, "num_tokens": 2009891.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6739 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 38.5, "completions/clipped_ratio": 0.0, "completions/max_length": 46.0, "completions/max_terminated_length": 46.0, "completions/mean_length": 38.5, "completions/mean_terminated_length": 38.5, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 124.81481481481481, "frac_reward_zero_std": 0.0, "grad_norm": 2.4534687995910645, "kl": 0.18199685215950012, "learning_rate": 1.8116666666666668e-06, "loss": 0.0515, "num_tokens": 2010297.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 6740 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 124.83333333333333, "frac_reward_zero_std": 1.0, "grad_norm": 0.1271682232618332, "kl": 0.08295064838603139, "learning_rate": 1.8111111111111113e-06, "loss": 0.0042, "num_tokens": 2010609.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6741 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 41.25, "completions/clipped_ratio": 0.0, "completions/max_length": 45.0, "completions/max_terminated_length": 45.0, "completions/mean_length": 41.25, "completions/mean_terminated_length": 41.25, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 124.85185185185185, "frac_reward_zero_std": 0.0, "grad_norm": 0.904565691947937, "kl": 0.07915183529257774, "learning_rate": 1.8105555555555557e-06, "loss": -0.0131, "num_tokens": 2011042.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 6742 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 37.0, "completions/clipped_ratio": 0.0, "completions/max_length": 41.0, "completions/max_terminated_length": 41.0, "completions/mean_length": 37.0, "completions/mean_terminated_length": 37.0, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 124.87037037037037, "frac_reward_zero_std": 0.0, "grad_norm": 2.7335548400878906, "kl": 0.09939774125814438, "learning_rate": 1.81e-06, "loss": 0.0225, "num_tokens": 2011418.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 6743 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 124.88888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.07447686791419983, "kl": 0.008132794522680342, "learning_rate": 1.8094444444444446e-06, "loss": 0.0004, "num_tokens": 2011706.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6744 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.25, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 4.25, "completions/mean_terminated_length": 4.25, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 124.9074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.024255141615867615, "kl": 0.00043610930151771754, "learning_rate": 1.808888888888889e-06, "loss": 0.0, "num_tokens": 2011919.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 6745 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 5.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 5.0, "completions/mean_terminated_length": 5.0, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 124.92592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.005060485564172268, "kl": 0.0006659924983978271, "learning_rate": 1.8083333333333335e-06, "loss": 0.0, "num_tokens": 2012139.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 6746 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 124.94444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.020898843184113503, "kl": 0.011801362037658691, "learning_rate": 1.807777777777778e-06, "loss": 0.0006, "num_tokens": 2012451.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 6747 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 124.96296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.018763819709420204, "kl": 0.014488769695162773, "learning_rate": 1.8072222222222224e-06, "loss": 0.0007, "num_tokens": 2012711.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6748 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.0, "completions/clipped_ratio": 0.0, "completions/max_length": 30.0, "completions/max_terminated_length": 30.0, "completions/mean_length": 22.0, "completions/mean_terminated_length": 22.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 124.98148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 31.023378372192383, "kl": 4.525606323033571, "learning_rate": 1.8066666666666668e-06, "loss": 0.1928, "num_tokens": 2013019.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 6749 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 125.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.04695005342364311, "kl": 0.005211193114519119, "learning_rate": 1.8061111111111112e-06, "loss": 0.0003, "num_tokens": 2013279.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 6750 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.5, "completions/clipped_ratio": 0.0, "completions/max_length": 27.0, "completions/max_terminated_length": 27.0, "completions/mean_length": 23.5, "completions/mean_terminated_length": 23.5, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 125.01851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 10.281536102294922, "kl": 0.9278224892914295, "learning_rate": 1.8055555555555557e-06, "loss": 0.0472, "num_tokens": 2013593.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 6751 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 125.03703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.021206088364124298, "kl": 0.011664429679512978, "learning_rate": 1.805e-06, "loss": 0.0006, "num_tokens": 2013905.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 6752 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.75, "completions/clipped_ratio": 0.0, "completions/max_length": 29.0, "completions/max_terminated_length": 29.0, "completions/mean_length": 18.75, "completions/mean_terminated_length": 18.75, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 125.05555555555556, "frac_reward_zero_std": 0.0, "grad_norm": 5.405542373657227, "kl": 0.1461472511291504, "learning_rate": 1.8044444444444444e-06, "loss": -0.045, "num_tokens": 2014200.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 6753 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.0, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 20.0, "completions/mean_terminated_length": 20.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 125.07407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.030187292024493217, "kl": 0.012453265488147736, "learning_rate": 1.803888888888889e-06, "loss": 0.0006, "num_tokens": 2014528.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6754 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 7.25, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 7.25, "completions/mean_terminated_length": 7.25, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 125.0925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.30998021364212036, "kl": 0.022797048470238224, "learning_rate": 1.8033333333333336e-06, "loss": 0.0016, "num_tokens": 2014777.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 6755 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.25, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 13.25, "completions/mean_terminated_length": 13.25, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 125.11111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.18204058706760406, "kl": 0.022254180861636996, "learning_rate": 1.802777777777778e-06, "loss": 0.0011, "num_tokens": 2015042.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6756 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 125.12962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.015506776981055737, "kl": 0.028102673590183258, "learning_rate": 1.8022222222222225e-06, "loss": 0.0014, "num_tokens": 2015258.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 6757 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 125.14814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.08078572154045105, "kl": 0.009487442846875638, "learning_rate": 1.8016666666666669e-06, "loss": 0.0005, "num_tokens": 2015528.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6758 }, { "clip_ratio/high_max": 0.007246376946568489, "clip_ratio/high_mean": 0.007246376946568489, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.007246376946568489, "completion_length": 33.25, "completions/clipped_ratio": 0.0, "completions/max_length": 39.0, "completions/max_terminated_length": 39.0, "completions/mean_length": 33.25, "completions/mean_terminated_length": 33.25, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 125.16666666666667, "frac_reward_zero_std": 0.0, "grad_norm": 2.3639769554138184, "kl": 0.11957107111811638, "learning_rate": 1.8011111111111112e-06, "loss": -0.0592, "num_tokens": 2015897.0, "reward": 7.625, "reward_std": 0.25, "rewards/reward_combined/mean": 7.625, "rewards/reward_combined/std": 0.25, "step": 6759 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.0, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 22.0, "completions/mean_terminated_length": 22.0, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 125.18518518518519, "frac_reward_zero_std": 1.0, "grad_norm": 0.10815676301717758, "kl": 0.019950886256992817, "learning_rate": 1.8005555555555556e-06, "loss": 0.001, "num_tokens": 2016229.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6760 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 63.75, "completions/clipped_ratio": 0.0, "completions/max_length": 132.0, "completions/max_terminated_length": 132.0, "completions/mean_length": 63.75, "completions/mean_terminated_length": 63.75, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 125.20370370370371, "frac_reward_zero_std": 0.0, "grad_norm": 2.3674588203430176, "kl": 0.08859207108616829, "learning_rate": 1.8000000000000001e-06, "loss": 0.0416, "num_tokens": 2016708.0, "reward": 3.5, "reward_std": 4.690415859222412, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 4.690415859222412, "step": 6761 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.0, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 20.0, "completions/mean_terminated_length": 20.0, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 125.22222222222223, "frac_reward_zero_std": 0.0, "grad_norm": 3.2530622482299805, "kl": 0.07163594523444772, "learning_rate": 1.7994444444444445e-06, "loss": 0.0508, "num_tokens": 2017008.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 6762 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 125.24074074074075, "frac_reward_zero_std": 1.0, "grad_norm": 0.15192607045173645, "kl": 0.04601516481488943, "learning_rate": 1.7988888888888888e-06, "loss": 0.0025, "num_tokens": 2017280.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6763 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 43.0, "completions/clipped_ratio": 0.0, "completions/max_length": 43.0, "completions/max_terminated_length": 43.0, "completions/mean_length": 43.0, "completions/mean_terminated_length": 43.0, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 125.25925925925925, "frac_reward_zero_std": 1.0, "grad_norm": 0.03742431849241257, "kl": 0.05028842203319073, "learning_rate": 1.7983333333333336e-06, "loss": 0.0025, "num_tokens": 2017732.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 6764 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 38.25, "completions/clipped_ratio": 0.0, "completions/max_length": 67.0, "completions/max_terminated_length": 67.0, "completions/mean_length": 38.25, "completions/mean_terminated_length": 38.25, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 125.27777777777777, "frac_reward_zero_std": 0.0, "grad_norm": 2.934157609939575, "kl": 0.11600305885076523, "learning_rate": 1.797777777777778e-06, "loss": -0.1096, "num_tokens": 2018137.0, "reward": 6.75, "reward_std": 2.1794495582580566, "rewards/reward_combined/mean": 6.75, "rewards/reward_combined/std": 2.1794495582580566, "step": 6765 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.25, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 4.25, "completions/mean_terminated_length": 4.25, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 125.29629629629629, "frac_reward_zero_std": 1.0, "grad_norm": 0.03077859617769718, "kl": 0.0004405632644193247, "learning_rate": 1.7972222222222223e-06, "loss": 0.0, "num_tokens": 2018350.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 6766 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.0, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 20.0, "completions/mean_terminated_length": 20.0, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 125.31481481481481, "frac_reward_zero_std": 1.0, "grad_norm": 0.04384357109665871, "kl": 0.01699654757976532, "learning_rate": 1.796666666666667e-06, "loss": 0.0008, "num_tokens": 2018682.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6767 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 125.33333333333333, "frac_reward_zero_std": 1.0, "grad_norm": 0.04069191962480545, "kl": 0.0025321990251541138, "learning_rate": 1.7961111111111113e-06, "loss": 0.0001, "num_tokens": 2018894.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 6768 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.5, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 17.5, "completions/mean_terminated_length": 17.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 125.35185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.018397608771920204, "kl": 0.13124053925275803, "learning_rate": 1.7955555555555556e-06, "loss": 0.0066, "num_tokens": 2019204.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6769 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 125.37037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.03078215755522251, "kl": 0.0048841339303180575, "learning_rate": 1.7950000000000002e-06, "loss": 0.0002, "num_tokens": 2019496.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6770 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 125.38888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.05747006833553314, "kl": 0.00925510236993432, "learning_rate": 1.7944444444444445e-06, "loss": 0.0005, "num_tokens": 2019780.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6771 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.25, "completions/clipped_ratio": 0.0, "completions/max_length": 31.0, "completions/max_terminated_length": 31.0, "completions/mean_length": 22.25, "completions/mean_terminated_length": 22.25, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 125.4074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.6363925337791443, "kl": 0.15771708451211452, "learning_rate": 1.7938888888888889e-06, "loss": 0.0079, "num_tokens": 2020097.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6772 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 5.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 5.0, "completions/mean_terminated_length": 5.0, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 125.42592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.004988895729184151, "kl": 0.0005555987299885601, "learning_rate": 1.7933333333333337e-06, "loss": 0.0, "num_tokens": 2020317.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 6773 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.5, "completions/clipped_ratio": 0.0, "completions/max_length": 42.0, "completions/max_terminated_length": 42.0, "completions/mean_length": 32.5, "completions/mean_terminated_length": 32.5, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 125.44444444444444, "frac_reward_zero_std": 0.0, "grad_norm": 6.172677993774414, "kl": 0.47566666454076767, "learning_rate": 1.792777777777778e-06, "loss": 0.1302, "num_tokens": 2020671.0, "reward": 4.0, "reward_std": 4.690415859222412, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 4.690415859222412, "step": 6774 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 125.46296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.058652881532907486, "kl": 0.005605354905128479, "learning_rate": 1.7922222222222224e-06, "loss": 0.0003, "num_tokens": 2020931.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 6775 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 36.25, "completions/clipped_ratio": 0.0, "completions/max_length": 51.0, "completions/max_terminated_length": 51.0, "completions/mean_length": 36.25, "completions/mean_terminated_length": 36.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 125.48148148148148, "frac_reward_zero_std": 0.0, "grad_norm": 2.8964171409606934, "kl": 0.05621919222176075, "learning_rate": 1.7916666666666667e-06, "loss": 0.025, "num_tokens": 2021296.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 6776 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 18.0, "completions/mean_terminated_length": 18.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 125.5, "frac_reward_zero_std": 1.0, "grad_norm": 0.07828111201524734, "kl": 0.017078780569136143, "learning_rate": 1.7911111111111113e-06, "loss": 0.0008, "num_tokens": 2021596.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6777 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 125.51851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.009758451022207737, "kl": 0.00024091004888759926, "learning_rate": 1.7905555555555557e-06, "loss": 0.0, "num_tokens": 2021852.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 6778 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 125.53703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.00012215836613904685, "kl": 1.078099012374878e-05, "learning_rate": 1.79e-06, "loss": 0.0, "num_tokens": 2022072.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 6779 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 29.5, "completions/clipped_ratio": 0.0, "completions/max_length": 32.0, "completions/max_terminated_length": 32.0, "completions/mean_length": 29.5, "completions/mean_terminated_length": 29.5, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 125.55555555555556, "frac_reward_zero_std": 0.0, "grad_norm": 2.6297085285186768, "kl": 0.1752680316567421, "learning_rate": 1.7894444444444446e-06, "loss": 0.0406, "num_tokens": 2022406.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 6780 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 125.57407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.004937241319566965, "kl": 0.01015009731054306, "learning_rate": 1.788888888888889e-06, "loss": 0.0005, "num_tokens": 2022642.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 6781 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 125.5925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.020818132907152176, "kl": 0.005633218679577112, "learning_rate": 1.7883333333333335e-06, "loss": 0.0003, "num_tokens": 2022946.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6782 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 125.61111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.052110206335783005, "kl": 0.009558569174259901, "learning_rate": 1.787777777777778e-06, "loss": 0.0005, "num_tokens": 2023218.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6783 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.25, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 14.25, "completions/mean_terminated_length": 14.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 125.62962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.2122967541217804, "kl": 0.0703795199515298, "learning_rate": 1.7872222222222224e-06, "loss": 0.0035, "num_tokens": 2023527.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6784 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 125.64814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.08843677490949631, "kl": 0.021839505061507225, "learning_rate": 1.7866666666666668e-06, "loss": 0.0011, "num_tokens": 2023805.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6785 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.0, "completions/clipped_ratio": 0.0, "completions/max_length": 27.0, "completions/max_terminated_length": 27.0, "completions/mean_length": 23.0, "completions/mean_terminated_length": 23.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 125.66666666666667, "frac_reward_zero_std": 1.0, "grad_norm": 0.023092173039913177, "kl": 0.011491512414067984, "learning_rate": 1.7861111111111111e-06, "loss": 0.0006, "num_tokens": 2024145.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 6786 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 125.68518518518519, "frac_reward_zero_std": 1.0, "grad_norm": 0.01564987562596798, "kl": 0.2267521545290947, "learning_rate": 1.7855555555555557e-06, "loss": 0.0113, "num_tokens": 2024447.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6787 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.5, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 8.5, "completions/mean_terminated_length": 8.5, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 125.70370370370371, "frac_reward_zero_std": 1.0, "grad_norm": 0.03930981829762459, "kl": 0.003102878457866609, "learning_rate": 1.785e-06, "loss": 0.0002, "num_tokens": 2024701.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6788 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.5, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 4.5, "completions/mean_terminated_length": 4.5, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 125.72222222222223, "frac_reward_zero_std": 0.0, "grad_norm": 11.806864738464355, "kl": 0.04142202064394951, "learning_rate": 1.7844444444444444e-06, "loss": 0.2143, "num_tokens": 2024915.0, "reward": 2.75, "reward_std": 1.5, "rewards/reward_combined/mean": 2.75, "rewards/reward_combined/std": 1.5, "step": 6789 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.0, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 22.0, "completions/mean_terminated_length": 22.0, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 125.74074074074075, "frac_reward_zero_std": 1.0, "grad_norm": 0.014387194998562336, "kl": 0.01115963701158762, "learning_rate": 1.7838888888888892e-06, "loss": 0.0006, "num_tokens": 2025231.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6790 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.25, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.25, "completions/mean_terminated_length": 14.25, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 125.75925925925925, "frac_reward_zero_std": 1.0, "grad_norm": 0.01950632408261299, "kl": 0.005717626423574984, "learning_rate": 1.7833333333333336e-06, "loss": 0.0003, "num_tokens": 2025520.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6791 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 125.77777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.01030044350773096, "kl": 0.002329520881175995, "learning_rate": 1.782777777777778e-06, "loss": 0.0001, "num_tokens": 2025764.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 6792 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.5, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 9.5, "completions/mean_terminated_length": 9.5, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 125.79629629629629, "frac_reward_zero_std": 1.0, "grad_norm": 0.01966673880815506, "kl": 0.002333159325644374, "learning_rate": 1.7822222222222225e-06, "loss": 0.0001, "num_tokens": 2026026.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6793 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.25, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 17.25, "completions/mean_terminated_length": 17.25, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 125.81481481481481, "frac_reward_zero_std": 1.0, "grad_norm": 0.1244720071554184, "kl": 0.020293911918997765, "learning_rate": 1.7816666666666668e-06, "loss": 0.001, "num_tokens": 2026335.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6794 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 36.75, "completions/clipped_ratio": 0.0, "completions/max_length": 40.0, "completions/max_terminated_length": 40.0, "completions/mean_length": 36.75, "completions/mean_terminated_length": 36.75, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 125.83333333333333, "frac_reward_zero_std": 0.0, "grad_norm": 2.1638071537017822, "kl": 0.07901446148753166, "learning_rate": 1.7811111111111112e-06, "loss": 0.0261, "num_tokens": 2026710.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 6795 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 125.85185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.004005740396678448, "kl": 0.2820390909910202, "learning_rate": 1.7805555555555555e-06, "loss": 0.0141, "num_tokens": 2026998.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6796 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 125.87037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.02226388268172741, "kl": 0.0020660633454099298, "learning_rate": 1.7800000000000001e-06, "loss": 0.0001, "num_tokens": 2027318.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6797 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.25, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 17.25, "completions/mean_terminated_length": 17.25, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 125.88888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.25234511494636536, "kl": 0.17797435075044632, "learning_rate": 1.7794444444444445e-06, "loss": 0.0089, "num_tokens": 2027607.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6798 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 36.75, "completions/clipped_ratio": 0.0, "completions/max_length": 45.0, "completions/max_terminated_length": 45.0, "completions/mean_length": 36.75, "completions/mean_terminated_length": 36.75, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 125.9074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.07064896076917648, "kl": 0.09398868680000305, "learning_rate": 1.7788888888888892e-06, "loss": 0.0044, "num_tokens": 2028022.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6799 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.75, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.75, "completions/mean_terminated_length": 6.75, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 125.92592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.019653912633657455, "kl": 0.0008251035469584167, "learning_rate": 1.7783333333333336e-06, "loss": 0.0, "num_tokens": 2028257.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 6800 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 125.94444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.020687146112322807, "kl": 0.014057477470487356, "learning_rate": 1.777777777777778e-06, "loss": 0.0007, "num_tokens": 2028517.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6801 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 125.96296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.040709495544433594, "kl": 0.021998626179993153, "learning_rate": 1.7772222222222223e-06, "loss": 0.0012, "num_tokens": 2028807.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6802 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 30.25, "completions/clipped_ratio": 0.0, "completions/max_length": 77.0, "completions/max_terminated_length": 77.0, "completions/mean_length": 30.25, "completions/mean_terminated_length": 30.25, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 125.98148148148148, "frac_reward_zero_std": 0.0, "grad_norm": 2.475203275680542, "kl": 0.11318349465727806, "learning_rate": 1.7766666666666669e-06, "loss": 0.34, "num_tokens": 2029144.0, "reward": 6.75, "reward_std": 2.5, "rewards/reward_combined/mean": 6.75, "rewards/reward_combined/std": 2.5, "step": 6803 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.25, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.25, "completions/mean_terminated_length": 13.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 126.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.0008775910246185958, "kl": 0.0018717190250754356, "learning_rate": 1.7761111111111112e-06, "loss": 0.0001, "num_tokens": 2029421.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6804 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.25, "completions/clipped_ratio": 0.0, "completions/max_length": 28.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 20.25, "completions/mean_terminated_length": 20.25, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 126.01851851851852, "frac_reward_zero_std": 0.0, "grad_norm": 3.0665109157562256, "kl": 0.03881406970322132, "learning_rate": 1.7755555555555556e-06, "loss": 0.112, "num_tokens": 2029742.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 6805 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 76.0, "completions/clipped_ratio": 0.25, "completions/max_length": 256.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 76.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 126.03703703703704, "frac_reward_zero_std": 0.0, "grad_norm": 1.5928553342819214, "kl": 0.14614778384566307, "learning_rate": 1.7750000000000002e-06, "loss": 0.4498, "num_tokens": 2030266.0, "reward": 5.550000190734863, "reward_std": 4.900000095367432, "rewards/reward_combined/mean": 5.550000190734863, "rewards/reward_combined/std": 4.900000095367432, "step": 6806 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 126.05555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.2372352033853531, "kl": 0.03672488604206592, "learning_rate": 1.7744444444444445e-06, "loss": 0.0024, "num_tokens": 2030538.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6807 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 126.07407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.025178473442792892, "kl": 0.0023085951106622815, "learning_rate": 1.773888888888889e-06, "loss": 0.0001, "num_tokens": 2030860.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6808 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 126.0925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.00013242023123893887, "kl": 1.2584030628204346e-05, "learning_rate": 1.7733333333333336e-06, "loss": 0.0, "num_tokens": 2031080.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 6809 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.75, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 4.75, "completions/mean_terminated_length": 4.75, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 126.11111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.019686048850417137, "kl": 0.027903071604669094, "learning_rate": 1.772777777777778e-06, "loss": 0.0014, "num_tokens": 2031299.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 6810 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 126.12962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.0006729425513185561, "kl": 0.0017719268798828125, "learning_rate": 1.7722222222222224e-06, "loss": 0.0001, "num_tokens": 2031579.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6811 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 17.0, "completions/mean_terminated_length": 17.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 126.14814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.03211108222603798, "kl": 0.1462779939174652, "learning_rate": 1.7716666666666667e-06, "loss": 0.0073, "num_tokens": 2031887.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6812 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 126.16666666666667, "frac_reward_zero_std": 1.0, "grad_norm": 0.007450329605489969, "kl": 0.0014159679412841797, "learning_rate": 1.7711111111111113e-06, "loss": 0.0001, "num_tokens": 2032099.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 6813 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 126.18518518518519, "frac_reward_zero_std": 1.0, "grad_norm": 0.022577151656150818, "kl": 0.011136623099446297, "learning_rate": 1.7705555555555556e-06, "loss": 0.0006, "num_tokens": 2032411.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 6814 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 43.0, "completions/clipped_ratio": 0.0, "completions/max_length": 43.0, "completions/max_terminated_length": 43.0, "completions/mean_length": 43.0, "completions/mean_terminated_length": 43.0, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 126.20370370370371, "frac_reward_zero_std": 1.0, "grad_norm": 0.02398080751299858, "kl": 0.04640117287635803, "learning_rate": 1.77e-06, "loss": 0.0023, "num_tokens": 2032863.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 6815 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 126.22222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.020874904468655586, "kl": 0.007939355447888374, "learning_rate": 1.7694444444444446e-06, "loss": 0.0004, "num_tokens": 2033151.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6816 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.25, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 21.25, "completions/mean_terminated_length": 21.25, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 126.24074074074075, "frac_reward_zero_std": 1.0, "grad_norm": 0.07880879193544388, "kl": 0.08514131419360638, "learning_rate": 1.7688888888888891e-06, "loss": 0.0041, "num_tokens": 2033456.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 6817 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 126.25925925925925, "frac_reward_zero_std": 1.0, "grad_norm": 0.04106632247567177, "kl": 0.01017641182988882, "learning_rate": 1.7683333333333335e-06, "loss": 0.0005, "num_tokens": 2033738.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6818 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 126.27777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.004959405865520239, "kl": 0.010140910744667053, "learning_rate": 1.767777777777778e-06, "loss": 0.0005, "num_tokens": 2033974.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 6819 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.75, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.75, "completions/mean_terminated_length": 11.75, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 126.29629629629629, "frac_reward_zero_std": 1.0, "grad_norm": 0.07171984016895294, "kl": 0.012121236883103848, "learning_rate": 1.7672222222222224e-06, "loss": 0.0006, "num_tokens": 2034233.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6820 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.75, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 17.75, "completions/mean_terminated_length": 17.75, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 126.31481481481481, "frac_reward_zero_std": 1.0, "grad_norm": 0.06305581331253052, "kl": 0.017260676249861717, "learning_rate": 1.7666666666666668e-06, "loss": 0.0008, "num_tokens": 2034532.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6821 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 40.25, "completions/clipped_ratio": 0.0, "completions/max_length": 53.0, "completions/max_terminated_length": 53.0, "completions/mean_length": 40.25, "completions/mean_terminated_length": 40.25, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 126.33333333333333, "frac_reward_zero_std": 1.0, "grad_norm": 0.13548287749290466, "kl": 0.04294486157596111, "learning_rate": 1.7661111111111111e-06, "loss": 0.002, "num_tokens": 2034913.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6822 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.0, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 19.0, "completions/mean_terminated_length": 19.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 126.35185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.09579591453075409, "kl": 0.025634801015257835, "learning_rate": 1.7655555555555557e-06, "loss": 0.0013, "num_tokens": 2035209.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6823 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 30.5, "completions/clipped_ratio": 0.0, "completions/max_length": 32.0, "completions/max_terminated_length": 32.0, "completions/mean_length": 30.5, "completions/mean_terminated_length": 30.5, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 126.37037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.474160760641098, "kl": 0.15562308579683304, "learning_rate": 1.765e-06, "loss": 0.0078, "num_tokens": 2035567.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6824 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 126.38888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.02957078255712986, "kl": 0.00132059893803671, "learning_rate": 1.7644444444444444e-06, "loss": 0.0001, "num_tokens": 2035837.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6825 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.5, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.5, "completions/mean_terminated_length": 6.5, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 126.4074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.005875066854059696, "kl": 0.0004861965135205537, "learning_rate": 1.7638888888888892e-06, "loss": 0.0, "num_tokens": 2036071.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 6826 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.0, "completions/clipped_ratio": 0.0, "completions/max_length": 27.0, "completions/max_terminated_length": 27.0, "completions/mean_length": 23.0, "completions/mean_terminated_length": 23.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 126.42592592592592, "frac_reward_zero_std": 0.0, "grad_norm": 2.084293842315674, "kl": 0.1225525550544262, "learning_rate": 1.7633333333333335e-06, "loss": -0.0797, "num_tokens": 2036391.0, "reward": 5.125, "reward_std": 3.4731109142303467, "rewards/reward_combined/mean": 5.125, "rewards/reward_combined/std": 3.4731109142303467, "step": 6827 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 16.5, "completions/mean_terminated_length": 16.5, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 126.44444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.03457742556929588, "kl": 0.006350204814225435, "learning_rate": 1.7627777777777779e-06, "loss": 0.0003, "num_tokens": 2036697.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6828 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.5, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 12.5, "completions/mean_terminated_length": 12.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 126.46296296296296, "frac_reward_zero_std": 0.0, "grad_norm": 5.666377544403076, "kl": 0.03588703088462353, "learning_rate": 1.7622222222222225e-06, "loss": 0.0063, "num_tokens": 2036963.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 6829 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 126.48148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.015639694407582283, "kl": 0.005213691620156169, "learning_rate": 1.7616666666666668e-06, "loss": 0.0003, "num_tokens": 2037247.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6830 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 126.5, "frac_reward_zero_std": 1.0, "grad_norm": 0.03726252540946007, "kl": 0.0008904486894607544, "learning_rate": 1.7611111111111112e-06, "loss": 0.0, "num_tokens": 2037459.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 6831 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.0, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 21.0, "completions/mean_terminated_length": 21.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 126.51851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.021301433444023132, "kl": 0.016551862936466932, "learning_rate": 1.7605555555555557e-06, "loss": 0.0008, "num_tokens": 2037791.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6832 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 126.53703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.010407980531454086, "kl": 0.0024238228797912598, "learning_rate": 1.76e-06, "loss": 0.0001, "num_tokens": 2038035.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 6833 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 44.5, "completions/clipped_ratio": 0.0, "completions/max_length": 45.0, "completions/max_terminated_length": 45.0, "completions/mean_length": 44.5, "completions/mean_terminated_length": 44.5, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 126.55555555555556, "frac_reward_zero_std": 0.0, "grad_norm": 1.062057375907898, "kl": 0.07389170303940773, "learning_rate": 1.7594444444444444e-06, "loss": 0.015, "num_tokens": 2038481.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 6834 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 126.57407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.021276036277413368, "kl": 0.25280025601387024, "learning_rate": 1.7588888888888892e-06, "loss": 0.0126, "num_tokens": 2038779.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6835 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 126.5925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.06127919629216194, "kl": 0.016417236998677254, "learning_rate": 1.7583333333333336e-06, "loss": 0.0008, "num_tokens": 2039047.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6836 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.5, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 12.5, "completions/mean_terminated_length": 12.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 126.61111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.17064251005649567, "kl": 0.01699390448629856, "learning_rate": 1.757777777777778e-06, "loss": 0.0009, "num_tokens": 2039317.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6837 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.75, "completions/clipped_ratio": 0.0, "completions/max_length": 27.0, "completions/max_terminated_length": 27.0, "completions/mean_length": 23.75, "completions/mean_terminated_length": 23.75, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 126.62962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.12190265953540802, "kl": 0.0237678331322968, "learning_rate": 1.7572222222222223e-06, "loss": 0.0012, "num_tokens": 2039660.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 6838 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 126.64814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.04838434234261513, "kl": 0.002945195185020566, "learning_rate": 1.7566666666666669e-06, "loss": 0.0001, "num_tokens": 2039916.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6839 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.25, "completions/clipped_ratio": 0.0, "completions/max_length": 35.0, "completions/max_terminated_length": 35.0, "completions/mean_length": 32.25, "completions/mean_terminated_length": 32.25, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 126.66666666666667, "frac_reward_zero_std": 1.0, "grad_norm": 0.03550216928124428, "kl": 0.0764811597764492, "learning_rate": 1.7561111111111112e-06, "loss": 0.0038, "num_tokens": 2040273.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6840 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.5, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 22.5, "completions/mean_terminated_length": 22.5, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 126.68518518518519, "frac_reward_zero_std": 1.0, "grad_norm": 0.3027445375919342, "kl": 0.043884921818971634, "learning_rate": 1.7555555555555556e-06, "loss": 0.0022, "num_tokens": 2040607.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6841 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.0, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 21.0, "completions/mean_terminated_length": 21.0, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 126.70370370370371, "frac_reward_zero_std": 1.0, "grad_norm": 0.06751609593629837, "kl": 0.017577162478119135, "learning_rate": 1.7550000000000001e-06, "loss": 0.0009, "num_tokens": 2040943.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6842 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 126.72222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.08687615394592285, "kl": 0.011726762168109417, "learning_rate": 1.7544444444444445e-06, "loss": 0.0007, "num_tokens": 2041251.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6843 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 38.75, "completions/clipped_ratio": 0.0, "completions/max_length": 57.0, "completions/max_terminated_length": 57.0, "completions/mean_length": 38.75, "completions/mean_terminated_length": 38.75, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 126.74074074074075, "frac_reward_zero_std": 1.0, "grad_norm": 0.3089616596698761, "kl": 0.06447676755487919, "learning_rate": 1.753888888888889e-06, "loss": 0.0036, "num_tokens": 2041630.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6844 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.0, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 22.0, "completions/mean_terminated_length": 22.0, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 126.75925925925925, "frac_reward_zero_std": 1.0, "grad_norm": 0.018121792003512383, "kl": 0.011236317921429873, "learning_rate": 1.7533333333333336e-06, "loss": 0.0006, "num_tokens": 2041946.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6845 }, { "clip_ratio/high_max": 0.00909090880304575, "clip_ratio/high_mean": 0.00909090880304575, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00909090880304575, "completion_length": 43.25, "completions/clipped_ratio": 0.0, "completions/max_length": 70.0, "completions/max_terminated_length": 70.0, "completions/mean_length": 43.25, "completions/mean_terminated_length": 43.25, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 126.77777777777777, "frac_reward_zero_std": 0.0, "grad_norm": 3.2371745109558105, "kl": 0.08289579674601555, "learning_rate": 1.752777777777778e-06, "loss": 0.0869, "num_tokens": 2042371.0, "reward": 6.125, "reward_std": 3.4247870445251465, "rewards/reward_combined/mean": 6.125, "rewards/reward_combined/std": 3.4247870445251465, "step": 6846 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.25, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.25, "completions/mean_terminated_length": 11.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 126.79629629629629, "frac_reward_zero_std": 1.0, "grad_norm": 0.012355736456811428, "kl": 0.008176295319572091, "learning_rate": 1.7522222222222223e-06, "loss": 0.0004, "num_tokens": 2042644.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6847 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.25, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 13.25, "completions/mean_terminated_length": 13.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 126.81481481481481, "frac_reward_zero_std": 0.0, "grad_norm": 2.5104241371154785, "kl": 0.16282328963279724, "learning_rate": 1.7516666666666667e-06, "loss": 0.019, "num_tokens": 2042929.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 6848 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 5.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 5.0, "completions/mean_terminated_length": 5.0, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 126.83333333333333, "frac_reward_zero_std": 1.0, "grad_norm": 0.004959432873874903, "kl": 0.000623619562247768, "learning_rate": 1.7511111111111113e-06, "loss": 0.0, "num_tokens": 2043149.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 6849 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 126.85185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.021398227661848068, "kl": 0.013925721403211355, "learning_rate": 1.7505555555555556e-06, "loss": 0.0007, "num_tokens": 2043409.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6850 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 29.5, "completions/clipped_ratio": 0.0, "completions/max_length": 32.0, "completions/max_terminated_length": 32.0, "completions/mean_length": 29.5, "completions/mean_terminated_length": 29.5, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 126.87037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.0924307331442833, "kl": 0.13965632021427155, "learning_rate": 1.75e-06, "loss": 0.007, "num_tokens": 2043743.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6851 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.5, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 12.5, "completions/mean_terminated_length": 12.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 126.88888888888889, "frac_reward_zero_std": 0.0, "grad_norm": 4.943894863128662, "kl": 0.09565679580555297, "learning_rate": 1.7494444444444448e-06, "loss": 0.1628, "num_tokens": 2044013.0, "reward": 2.75, "reward_std": 1.5, "rewards/reward_combined/mean": 2.75, "rewards/reward_combined/std": 1.5, "step": 6852 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 126.9074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.02977580577135086, "kl": 0.0004369795206002891, "learning_rate": 1.7488888888888891e-06, "loss": 0.0, "num_tokens": 2044269.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 6853 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 126.92592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.06817903369665146, "kl": 0.005160473985597491, "learning_rate": 1.7483333333333335e-06, "loss": 0.0003, "num_tokens": 2044553.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6854 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 126.94444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.004008350893855095, "kl": 0.282049298286438, "learning_rate": 1.747777777777778e-06, "loss": 0.0141, "num_tokens": 2044841.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6855 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 126.96296296296296, "frac_reward_zero_std": 0.0, "grad_norm": 2.7068326473236084, "kl": 0.31227077916264534, "learning_rate": 1.7472222222222224e-06, "loss": 0.0156, "num_tokens": 2045101.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 6856 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.75, "completions/clipped_ratio": 0.0, "completions/max_length": 40.0, "completions/max_terminated_length": 40.0, "completions/mean_length": 32.75, "completions/mean_terminated_length": 32.75, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 126.98148148148148, "frac_reward_zero_std": 0.0, "grad_norm": 2.7092843055725098, "kl": 0.1845817193388939, "learning_rate": 1.7466666666666667e-06, "loss": 0.1326, "num_tokens": 2045456.0, "reward": 5.125, "reward_std": 3.4731109142303467, "rewards/reward_combined/mean": 5.125, "rewards/reward_combined/std": 3.4731109142303467, "step": 6857 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.5, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 3.5, "completions/mean_terminated_length": 3.5, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 127.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.02755337953567505, "kl": 0.003378540277481079, "learning_rate": 1.746111111111111e-06, "loss": 0.0001, "num_tokens": 2045666.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 6858 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 127.01851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.07771772146224976, "kl": 0.009111419960390776, "learning_rate": 1.7455555555555557e-06, "loss": 0.0005, "num_tokens": 2045938.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6859 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 9.5, "completions/mean_terminated_length": 9.5, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 127.03703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.04103042930364609, "kl": 0.0033672231948003173, "learning_rate": 1.745e-06, "loss": 0.0002, "num_tokens": 2046196.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6860 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 37.25, "completions/clipped_ratio": 0.0, "completions/max_length": 41.0, "completions/max_terminated_length": 41.0, "completions/mean_length": 37.25, "completions/mean_terminated_length": 37.25, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 127.05555555555556, "frac_reward_zero_std": 0.0, "grad_norm": 1.8999254703521729, "kl": 0.13181667029857635, "learning_rate": 1.7444444444444448e-06, "loss": 0.042, "num_tokens": 2046573.0, "reward": 4.0, "reward_std": 2.915475845336914, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 2.915475845336914, "step": 6861 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 17.0, "completions/mean_terminated_length": 17.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 127.07407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.04136445000767708, "kl": 0.009334301110357046, "learning_rate": 1.7438888888888892e-06, "loss": 0.0005, "num_tokens": 2046881.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6862 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 127.0925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.0429098904132843, "kl": 0.1815880462527275, "learning_rate": 1.7433333333333335e-06, "loss": 0.0091, "num_tokens": 2047165.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6863 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 127.11111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.010144614614546299, "kl": 0.002692032605409622, "learning_rate": 1.7427777777777779e-06, "loss": 0.0001, "num_tokens": 2047409.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 6864 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 127.12962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.026478635147213936, "kl": 0.00548108946532011, "learning_rate": 1.7422222222222224e-06, "loss": 0.0003, "num_tokens": 2047693.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6865 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 34.0, "completions/clipped_ratio": 0.0, "completions/max_length": 40.0, "completions/max_terminated_length": 40.0, "completions/mean_length": 34.0, "completions/mean_terminated_length": 34.0, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 127.14814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.2724255919456482, "kl": 0.07417556643486023, "learning_rate": 1.7416666666666668e-06, "loss": 0.0036, "num_tokens": 2048049.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6866 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 127.16666666666667, "frac_reward_zero_std": 1.0, "grad_norm": 0.015704995021224022, "kl": 0.00028780996944988146, "learning_rate": 1.7411111111111111e-06, "loss": 0.0, "num_tokens": 2048305.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 6867 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 127.18518518518519, "frac_reward_zero_std": 1.0, "grad_norm": 0.04594772681593895, "kl": 0.010061865672469139, "learning_rate": 1.7405555555555557e-06, "loss": 0.0005, "num_tokens": 2048577.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6868 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.0, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 21.0, "completions/mean_terminated_length": 21.0, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 127.20370370370371, "frac_reward_zero_std": 1.0, "grad_norm": 0.0236633513122797, "kl": 0.0134973987005651, "learning_rate": 1.74e-06, "loss": 0.0007, "num_tokens": 2048913.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6869 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 33.75, "completions/clipped_ratio": 0.0, "completions/max_length": 38.0, "completions/max_terminated_length": 38.0, "completions/mean_length": 33.75, "completions/mean_terminated_length": 33.75, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 127.22222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.026063354685902596, "kl": 0.022107355296611786, "learning_rate": 1.7394444444444446e-06, "loss": 0.0011, "num_tokens": 2049272.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6870 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 127.24074074074075, "frac_reward_zero_std": 1.0, "grad_norm": 0.0007950562867335975, "kl": 0.001789235626347363, "learning_rate": 1.7388888888888892e-06, "loss": 0.0001, "num_tokens": 2049552.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6871 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.75, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 13.75, "completions/mean_terminated_length": 13.75, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 127.25925925925925, "frac_reward_zero_std": 0.0, "grad_norm": 3.1527042388916016, "kl": 0.8143503367900848, "learning_rate": 1.7383333333333336e-06, "loss": -0.035, "num_tokens": 2049839.0, "reward": 4.0, "reward_std": 4.690415859222412, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 4.690415859222412, "step": 6872 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 10.5, "completions/mean_terminated_length": 10.5, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 127.27777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.012201914563775063, "kl": 0.0035354513092897832, "learning_rate": 1.737777777777778e-06, "loss": 0.0002, "num_tokens": 2050105.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6873 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 127.29629629629629, "frac_reward_zero_std": 1.0, "grad_norm": 0.015308109112083912, "kl": 0.0015829511103220284, "learning_rate": 1.7372222222222223e-06, "loss": 0.0001, "num_tokens": 2050395.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6874 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 127.31481481481481, "frac_reward_zero_std": 1.0, "grad_norm": 0.0038729459047317505, "kl": 0.2820817232131958, "learning_rate": 1.7366666666666668e-06, "loss": 0.0141, "num_tokens": 2050683.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6875 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.75, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 21.75, "completions/mean_terminated_length": 21.75, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 127.33333333333333, "frac_reward_zero_std": 1.0, "grad_norm": 0.03967523202300072, "kl": 0.011493564583361149, "learning_rate": 1.7361111111111112e-06, "loss": 0.0006, "num_tokens": 2051014.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6876 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 44.5, "completions/clipped_ratio": 0.0, "completions/max_length": 51.0, "completions/max_terminated_length": 51.0, "completions/mean_length": 44.5, "completions/mean_terminated_length": 44.5, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "epoch": 127.35185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.08457355946302414, "kl": 0.057712242007255554, "learning_rate": 1.7355555555555555e-06, "loss": 0.0028, "num_tokens": 2051472.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 6877 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 27.75, "completions/clipped_ratio": 0.0, "completions/max_length": 34.0, "completions/max_terminated_length": 34.0, "completions/mean_length": 27.75, "completions/mean_terminated_length": 27.75, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 127.37037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.35766780376434326, "kl": 0.053338754922151566, "learning_rate": 1.7350000000000001e-06, "loss": 0.0029, "num_tokens": 2051831.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 6878 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 41.75, "completions/clipped_ratio": 0.0, "completions/max_length": 68.0, "completions/max_terminated_length": 68.0, "completions/mean_length": 41.75, "completions/mean_terminated_length": 41.75, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 127.38888888888889, "frac_reward_zero_std": 0.0, "grad_norm": 1.8751410245895386, "kl": 0.20407681167125702, "learning_rate": 1.7344444444444447e-06, "loss": 0.0111, "num_tokens": 2052222.0, "reward": 5.75, "reward_std": 2.598076105117798, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 2.598076105117798, "step": 6879 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 29.75, "completions/clipped_ratio": 0.0, "completions/max_length": 32.0, "completions/max_terminated_length": 32.0, "completions/mean_length": 29.75, "completions/mean_terminated_length": 29.75, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 127.4074074074074, "frac_reward_zero_std": 0.0, "grad_norm": 2.0829389095306396, "kl": 0.18563345074653625, "learning_rate": 1.733888888888889e-06, "loss": 0.0099, "num_tokens": 2052557.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 6880 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.5, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 19.5, "completions/mean_terminated_length": 19.5, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 127.42592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.03196534141898155, "kl": 0.0109897805377841, "learning_rate": 1.7333333333333336e-06, "loss": 0.0005, "num_tokens": 2052863.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6881 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 12.5, "completions/mean_terminated_length": 12.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 127.44444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.15042755007743835, "kl": 0.05621676193550229, "learning_rate": 1.732777777777778e-06, "loss": 0.0028, "num_tokens": 2053165.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6882 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 5.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 5.0, "completions/mean_terminated_length": 5.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 127.46296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.08291668444871902, "kl": 0.005565872794250026, "learning_rate": 1.7322222222222223e-06, "loss": 0.0003, "num_tokens": 2053385.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 6883 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.009433962404727936, "clip_ratio/low_min": 0.009433962404727936, "clip_ratio/region_mean": 0.009433962404727936, "completion_length": 22.75, "completions/clipped_ratio": 0.0, "completions/max_length": 34.0, "completions/max_terminated_length": 34.0, "completions/mean_length": 22.75, "completions/mean_terminated_length": 22.75, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 127.48148148148148, "frac_reward_zero_std": 0.0, "grad_norm": 1.987447738647461, "kl": 0.04249482788145542, "learning_rate": 1.7316666666666667e-06, "loss": 0.1375, "num_tokens": 2053728.0, "reward": 5.875, "reward_std": 3.25, "rewards/reward_combined/mean": 5.875, "rewards/reward_combined/std": 3.25, "step": 6884 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.75, "completions/clipped_ratio": 0.0, "completions/max_length": 29.0, "completions/max_terminated_length": 29.0, "completions/mean_length": 23.75, "completions/mean_terminated_length": 23.75, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 127.5, "frac_reward_zero_std": 0.0, "grad_norm": 2.4286091327667236, "kl": 0.19172262027859688, "learning_rate": 1.7311111111111112e-06, "loss": 0.0548, "num_tokens": 2054071.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 6885 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 127.51851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.08625127375125885, "kl": 0.0064963772892951965, "learning_rate": 1.7305555555555556e-06, "loss": 0.0003, "num_tokens": 2054331.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 6886 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 127.53703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.02169019542634487, "kl": 0.01142323948442936, "learning_rate": 1.73e-06, "loss": 0.0006, "num_tokens": 2054643.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 6887 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 127.55555555555556, "frac_reward_zero_std": 1.0, "grad_norm": 0.11134855449199677, "kl": 0.006786237005144358, "learning_rate": 1.7294444444444447e-06, "loss": 0.0005, "num_tokens": 2054859.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 6888 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.75, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 9.75, "completions/mean_terminated_length": 9.75, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 127.57407407407408, "frac_reward_zero_std": 0.0, "grad_norm": 6.778787612915039, "kl": 0.02135028876364231, "learning_rate": 1.728888888888889e-06, "loss": 0.0771, "num_tokens": 2055122.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 6889 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.75, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.75, "completions/mean_terminated_length": 11.75, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 127.5925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.08354557305574417, "kl": 0.01072307676076889, "learning_rate": 1.7283333333333334e-06, "loss": 0.0005, "num_tokens": 2055381.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6890 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 127.61111111111111, "frac_reward_zero_std": 0.0, "grad_norm": 5.762946128845215, "kl": 0.2530154511332512, "learning_rate": 1.727777777777778e-06, "loss": 0.1158, "num_tokens": 2055593.0, "reward": 2.0, "reward_std": 3.0, "rewards/reward_combined/mean": 2.0, "rewards/reward_combined/std": 3.0, "step": 6891 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 127.62962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.024058017879724503, "kl": 0.011032984068151563, "learning_rate": 1.7272222222222224e-06, "loss": 0.0006, "num_tokens": 2055879.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6892 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.5, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 22.5, "completions/mean_terminated_length": 22.5, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 127.64814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.03255914896726608, "kl": 0.009880050085484982, "learning_rate": 1.7266666666666667e-06, "loss": 0.0005, "num_tokens": 2056197.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6893 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 30.0, "completions/clipped_ratio": 0.0, "completions/max_length": 30.0, "completions/max_terminated_length": 30.0, "completions/mean_length": 30.0, "completions/mean_terminated_length": 30.0, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 127.66666666666667, "frac_reward_zero_std": 1.0, "grad_norm": 0.03886338323354721, "kl": 0.13087694346904755, "learning_rate": 1.726111111111111e-06, "loss": 0.0065, "num_tokens": 2056553.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6894 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.75, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 11.75, "completions/mean_terminated_length": 11.75, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 127.68518518518519, "frac_reward_zero_std": 1.0, "grad_norm": 0.7883631587028503, "kl": 0.141303863376379, "learning_rate": 1.7255555555555556e-06, "loss": 0.0061, "num_tokens": 2056820.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6895 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.25, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 19.25, "completions/mean_terminated_length": 19.25, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 127.70370370370371, "frac_reward_zero_std": 1.0, "grad_norm": 0.0641993060708046, "kl": 0.021962422877550125, "learning_rate": 1.725e-06, "loss": 0.0011, "num_tokens": 2057117.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6896 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.75, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.75, "completions/mean_terminated_length": 14.75, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 127.72222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.04212106391787529, "kl": 0.2505946531891823, "learning_rate": 1.7244444444444448e-06, "loss": 0.0125, "num_tokens": 2057416.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6897 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 127.74074074074075, "frac_reward_zero_std": 1.0, "grad_norm": 0.014317400753498077, "kl": 0.028141051530838013, "learning_rate": 1.7238888888888891e-06, "loss": 0.0014, "num_tokens": 2057632.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 6898 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 127.75925925925925, "frac_reward_zero_std": 1.0, "grad_norm": 0.13981042802333832, "kl": 0.022394862957298756, "learning_rate": 1.7233333333333335e-06, "loss": 0.0012, "num_tokens": 2057900.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6899 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 41.25, "completions/clipped_ratio": 0.0, "completions/max_length": 45.0, "completions/max_terminated_length": 45.0, "completions/mean_length": 41.25, "completions/mean_terminated_length": 41.25, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 127.77777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.03729470446705818, "kl": 0.08320135250687599, "learning_rate": 1.7227777777777778e-06, "loss": 0.004, "num_tokens": 2058333.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6900 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.25, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.25, "completions/mean_terminated_length": 14.25, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 127.79629629629629, "frac_reward_zero_std": 1.0, "grad_norm": 0.005598236806690693, "kl": 0.0015855496167205274, "learning_rate": 1.7222222222222224e-06, "loss": 0.0001, "num_tokens": 2058650.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6901 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 127.81481481481481, "frac_reward_zero_std": 1.0, "grad_norm": 0.020556630566716194, "kl": 0.00891830399632454, "learning_rate": 1.7216666666666668e-06, "loss": 0.0005, "num_tokens": 2058932.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6902 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.25, "completions/clipped_ratio": 0.0, "completions/max_length": 29.0, "completions/max_terminated_length": 29.0, "completions/mean_length": 24.25, "completions/mean_terminated_length": 24.25, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 127.83333333333333, "frac_reward_zero_std": 1.0, "grad_norm": 0.06277669966220856, "kl": 0.06663670018315315, "learning_rate": 1.7211111111111111e-06, "loss": 0.0033, "num_tokens": 2059249.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 6903 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 18.0, "completions/mean_terminated_length": 18.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 127.85185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.0065288483165204525, "kl": 0.0029946829890832305, "learning_rate": 1.7205555555555557e-06, "loss": 0.0001, "num_tokens": 2059561.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6904 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 28.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 127.87037037037037, "frac_reward_zero_std": 0.0, "grad_norm": 7.233670711517334, "kl": 0.017525983974337578, "learning_rate": 1.72e-06, "loss": 0.3847, "num_tokens": 2059797.0, "reward": 2.75, "reward_std": 1.5, "rewards/reward_combined/mean": 2.75, "rewards/reward_combined/std": 1.5, "step": 6905 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.5, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 20.5, "completions/mean_terminated_length": 20.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 127.88888888888889, "frac_reward_zero_std": 0.0, "grad_norm": 3.0135765075683594, "kl": 0.047845881432294846, "learning_rate": 1.7194444444444446e-06, "loss": -0.1039, "num_tokens": 2060107.0, "reward": 7.875, "reward_std": 0.25, "rewards/reward_combined/mean": 7.875, "rewards/reward_combined/std": 0.25, "step": 6906 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.75, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.75, "completions/mean_terminated_length": 6.75, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 127.9074074074074, "frac_reward_zero_std": 1.0, "grad_norm": 0.010433281771838665, "kl": 0.0004255466483300552, "learning_rate": 1.7188888888888892e-06, "loss": 0.0, "num_tokens": 2060342.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 6907 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 127.92592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.00012508024519775063, "kl": 1.1272728443145752e-05, "learning_rate": 1.7183333333333335e-06, "loss": 0.0, "num_tokens": 2060562.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 6908 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 127.94444444444444, "frac_reward_zero_std": 1.0, "grad_norm": 0.3391644060611725, "kl": 0.05276930780382827, "learning_rate": 1.717777777777778e-06, "loss": 0.0027, "num_tokens": 2060826.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 6909 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.25, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 17.25, "completions/mean_terminated_length": 17.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 127.96296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.03665275126695633, "kl": 0.17515547573566437, "learning_rate": 1.7172222222222223e-06, "loss": 0.0088, "num_tokens": 2061135.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6910 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 127.98148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.004276719875633717, "kl": 0.010388650000095367, "learning_rate": 1.7166666666666668e-06, "loss": 0.0005, "num_tokens": 2061371.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 6911 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.75, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 11.75, "completions/mean_terminated_length": 11.75, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 128.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.08580157905817032, "kl": 0.035604508593678474, "learning_rate": 1.7161111111111112e-06, "loss": 0.0019, "num_tokens": 2061642.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6912 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.5, "completions/clipped_ratio": 0.0, "completions/max_length": 29.0, "completions/max_terminated_length": 29.0, "completions/mean_length": 25.5, "completions/mean_terminated_length": 25.5, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 128.0185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.09078051894903183, "kl": 0.027594943530857563, "learning_rate": 1.7155555555555555e-06, "loss": 0.0014, "num_tokens": 2061992.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 6913 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 128.03703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.01894962415099144, "kl": 0.001962851150892675, "learning_rate": 1.7150000000000003e-06, "loss": 0.0001, "num_tokens": 2062314.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6914 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 128.05555555555554, "frac_reward_zero_std": 1.0, "grad_norm": 0.03516704961657524, "kl": 0.005703192204236984, "learning_rate": 1.7144444444444447e-06, "loss": 0.0003, "num_tokens": 2062558.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 6915 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.25, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.25, "completions/mean_terminated_length": 11.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 128.07407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.13221979141235352, "kl": 0.028607314452528954, "learning_rate": 1.713888888888889e-06, "loss": 0.0015, "num_tokens": 2062831.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6916 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 128.09259259259258, "frac_reward_zero_std": 1.0, "grad_norm": 0.003938273061066866, "kl": 0.0005194604455027729, "learning_rate": 1.7133333333333336e-06, "loss": 0.0, "num_tokens": 2063091.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 6917 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 35.5, "completions/clipped_ratio": 0.0, "completions/max_length": 38.0, "completions/max_terminated_length": 38.0, "completions/mean_length": 35.5, "completions/mean_terminated_length": 35.5, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 128.11111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.1706613302230835, "kl": 0.09097743779420853, "learning_rate": 1.712777777777778e-06, "loss": 0.0046, "num_tokens": 2063461.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6918 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 128.12962962962962, "frac_reward_zero_std": 1.0, "grad_norm": 0.03364564850926399, "kl": 0.011922517325729132, "learning_rate": 1.7122222222222223e-06, "loss": 0.0006, "num_tokens": 2063749.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6919 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 128.14814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.00012901022273581475, "kl": 1.1868774890899658e-05, "learning_rate": 1.7116666666666667e-06, "loss": 0.0, "num_tokens": 2063969.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 6920 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.75, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 21.75, "completions/mean_terminated_length": 21.75, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 128.16666666666666, "frac_reward_zero_std": 1.0, "grad_norm": 0.02396240085363388, "kl": 0.0075617386028170586, "learning_rate": 1.7111111111111112e-06, "loss": 0.0004, "num_tokens": 2064300.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6921 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.25, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 13.25, "completions/mean_terminated_length": 13.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 128.1851851851852, "frac_reward_zero_std": 0.0, "grad_norm": 2.201509952545166, "kl": 0.2509969547390938, "learning_rate": 1.7105555555555556e-06, "loss": -0.069, "num_tokens": 2064585.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 6922 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 30.5, "completions/clipped_ratio": 0.0, "completions/max_length": 37.0, "completions/max_terminated_length": 37.0, "completions/mean_length": 30.5, "completions/mean_terminated_length": 30.5, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 128.2037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.12443941086530685, "kl": 0.14510150253772736, "learning_rate": 1.7100000000000004e-06, "loss": 0.0073, "num_tokens": 2064931.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6923 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 44.75, "completions/clipped_ratio": 0.0, "completions/max_length": 58.0, "completions/max_terminated_length": 58.0, "completions/mean_length": 44.75, "completions/mean_terminated_length": 44.75, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 128.22222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.5169034004211426, "kl": 0.11073892191052437, "learning_rate": 1.7094444444444447e-06, "loss": 0.0058, "num_tokens": 2065330.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6924 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 128.24074074074073, "frac_reward_zero_std": 1.0, "grad_norm": 0.09506568312644958, "kl": 0.2363072633743286, "learning_rate": 1.708888888888889e-06, "loss": 0.0118, "num_tokens": 2065632.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6925 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 128.25925925925927, "frac_reward_zero_std": 1.0, "grad_norm": 0.030772222205996513, "kl": 0.010267202276736498, "learning_rate": 1.7083333333333334e-06, "loss": 0.0005, "num_tokens": 2065916.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6926 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 128.27777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.1545897126197815, "kl": 0.014528238214552402, "learning_rate": 1.707777777777778e-06, "loss": 0.0007, "num_tokens": 2066196.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6927 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.75, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 12.75, "completions/mean_terminated_length": 12.75, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 128.2962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.10985057055950165, "kl": 0.053186945617198944, "learning_rate": 1.7072222222222223e-06, "loss": 0.0029, "num_tokens": 2066471.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6928 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.25, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 20.25, "completions/mean_terminated_length": 20.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 128.3148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.057306401431560516, "kl": 0.23513171821832657, "learning_rate": 1.7066666666666667e-06, "loss": 0.0118, "num_tokens": 2066800.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6929 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 5.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 5.0, "completions/mean_terminated_length": 5.0, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 128.33333333333334, "frac_reward_zero_std": 1.0, "grad_norm": 0.00423731142655015, "kl": 0.000380516066798009, "learning_rate": 1.706111111111111e-06, "loss": 0.0, "num_tokens": 2067020.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 6930 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 128.35185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.02369505539536476, "kl": 0.0032640923745930195, "learning_rate": 1.7055555555555556e-06, "loss": 0.0002, "num_tokens": 2067284.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6931 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.25, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 13.25, "completions/mean_terminated_length": 13.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 128.37037037037038, "frac_reward_zero_std": 1.0, "grad_norm": 0.733614981174469, "kl": 0.09401548281311989, "learning_rate": 1.7050000000000002e-06, "loss": 0.005, "num_tokens": 2067553.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6932 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 31.25, "completions/clipped_ratio": 0.0, "completions/max_length": 35.0, "completions/max_terminated_length": 35.0, "completions/mean_length": 31.25, "completions/mean_terminated_length": 31.25, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 128.38888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.07995583117008209, "kl": 0.06322385743260384, "learning_rate": 1.7044444444444448e-06, "loss": 0.0032, "num_tokens": 2067914.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6933 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 128.40740740740742, "frac_reward_zero_std": 1.0, "grad_norm": 0.06576652079820633, "kl": 0.009501576889306307, "learning_rate": 1.7038888888888891e-06, "loss": 0.0005, "num_tokens": 2068214.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6934 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.0, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 22.0, "completions/mean_terminated_length": 22.0, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 128.42592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.02529844641685486, "kl": 0.011380805168300867, "learning_rate": 1.7033333333333335e-06, "loss": 0.0006, "num_tokens": 2068530.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6935 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 128.44444444444446, "frac_reward_zero_std": 1.0, "grad_norm": 0.002065822947770357, "kl": 0.00019543171219993383, "learning_rate": 1.7027777777777778e-06, "loss": 0.0, "num_tokens": 2068786.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 6936 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 128.46296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.003988214768469334, "kl": 0.01046241819858551, "learning_rate": 1.7022222222222224e-06, "loss": 0.0005, "num_tokens": 2069022.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 6937 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.25, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.25, "completions/mean_terminated_length": 13.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 128.4814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.011008348315954208, "kl": 0.0022738593979738653, "learning_rate": 1.7016666666666668e-06, "loss": 0.0001, "num_tokens": 2069299.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6938 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.75, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 18.75, "completions/mean_terminated_length": 18.75, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 128.5, "frac_reward_zero_std": 1.0, "grad_norm": 0.03487025573849678, "kl": 0.011679781135171652, "learning_rate": 1.7011111111111111e-06, "loss": 0.0006, "num_tokens": 2069602.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6939 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 35.5, "completions/clipped_ratio": 0.0, "completions/max_length": 46.0, "completions/max_terminated_length": 46.0, "completions/mean_length": 35.5, "completions/mean_terminated_length": 35.5, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 128.5185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.13369712233543396, "kl": 0.06630091741681099, "learning_rate": 1.7005555555555557e-06, "loss": 0.0034, "num_tokens": 2069968.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6940 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 128.53703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.014761483296751976, "kl": 0.02830091118812561, "learning_rate": 1.7000000000000002e-06, "loss": 0.0014, "num_tokens": 2070184.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 6941 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 128.55555555555554, "frac_reward_zero_std": 1.0, "grad_norm": 0.05422572046518326, "kl": 0.0056973472237586975, "learning_rate": 1.6994444444444446e-06, "loss": 0.0003, "num_tokens": 2070444.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 6942 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 41.75, "completions/clipped_ratio": 0.0, "completions/max_length": 45.0, "completions/max_terminated_length": 45.0, "completions/mean_length": 41.75, "completions/mean_terminated_length": 41.75, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 128.57407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.5965837836265564, "kl": 0.22845587506890297, "learning_rate": 1.6988888888888892e-06, "loss": 0.0113, "num_tokens": 2070879.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6943 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.5, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 18.5, "completions/mean_terminated_length": 18.5, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 128.59259259259258, "frac_reward_zero_std": 1.0, "grad_norm": 0.09165317565202713, "kl": 0.009825895074754953, "learning_rate": 1.6983333333333335e-06, "loss": 0.0005, "num_tokens": 2071193.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6944 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.0, "completions/clipped_ratio": 0.0, "completions/max_length": 30.0, "completions/max_terminated_length": 30.0, "completions/mean_length": 21.0, "completions/mean_terminated_length": 21.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 128.61111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.15831559896469116, "kl": 0.05637744814157486, "learning_rate": 1.6977777777777779e-06, "loss": 0.0028, "num_tokens": 2071497.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 6945 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 128.62962962962962, "frac_reward_zero_std": 1.0, "grad_norm": 0.0019767603371292353, "kl": 0.2825882136821747, "learning_rate": 1.6972222222222222e-06, "loss": 0.0141, "num_tokens": 2071785.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6946 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 45.0, "completions/clipped_ratio": 0.0, "completions/max_length": 51.0, "completions/max_terminated_length": 51.0, "completions/mean_length": 45.0, "completions/mean_terminated_length": 45.0, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 128.64814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.022673996165394783, "kl": 0.04744488000869751, "learning_rate": 1.6966666666666668e-06, "loss": 0.0024, "num_tokens": 2072245.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 6947 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 30.0, "completions/clipped_ratio": 0.0, "completions/max_length": 39.0, "completions/max_terminated_length": 39.0, "completions/mean_length": 30.0, "completions/mean_terminated_length": 30.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 128.66666666666666, "frac_reward_zero_std": 0.0, "grad_norm": 4.094484806060791, "kl": 0.20300962030887604, "learning_rate": 1.6961111111111112e-06, "loss": 0.1508, "num_tokens": 2072585.0, "reward": 4.5, "reward_std": 4.041451930999756, "rewards/reward_combined/mean": 4.5, "rewards/reward_combined/std": 4.041451930999756, "step": 6948 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.75, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 16.75, "completions/mean_terminated_length": 16.75, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 128.6851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.12890169024467468, "kl": 0.028811444528400898, "learning_rate": 1.6955555555555555e-06, "loss": 0.0016, "num_tokens": 2072872.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6949 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.25, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 17.25, "completions/mean_terminated_length": 17.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 128.7037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.04091022536158562, "kl": 0.1759033128619194, "learning_rate": 1.6950000000000003e-06, "loss": 0.0088, "num_tokens": 2073181.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6950 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.75, "completions/clipped_ratio": 0.0, "completions/max_length": 37.0, "completions/max_terminated_length": 37.0, "completions/mean_length": 24.75, "completions/mean_terminated_length": 24.75, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 128.72222222222223, "frac_reward_zero_std": 0.0, "grad_norm": 4.72373104095459, "kl": 0.032548592425882816, "learning_rate": 1.6944444444444446e-06, "loss": 0.1333, "num_tokens": 2073532.0, "reward": 5.75, "reward_std": 4.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 4.5, "step": 6951 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.25, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.25, "completions/mean_terminated_length": 14.25, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 128.74074074074073, "frac_reward_zero_std": 1.0, "grad_norm": 0.025822332128882408, "kl": 0.020225325133651495, "learning_rate": 1.693888888888889e-06, "loss": 0.001, "num_tokens": 2073821.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6952 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 128.75925925925927, "frac_reward_zero_std": 1.0, "grad_norm": 0.048724520951509476, "kl": 0.004470242420211434, "learning_rate": 1.6933333333333336e-06, "loss": 0.0002, "num_tokens": 2074077.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6953 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 128.77777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.007398621179163456, "kl": 0.001859322190284729, "learning_rate": 1.692777777777778e-06, "loss": 0.0001, "num_tokens": 2074289.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 6954 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 128.7962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.07567547261714935, "kl": 0.008243201649747789, "learning_rate": 1.6922222222222223e-06, "loss": 0.0004, "num_tokens": 2074559.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6955 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 36.25, "completions/clipped_ratio": 0.0, "completions/max_length": 49.0, "completions/max_terminated_length": 49.0, "completions/mean_length": 36.25, "completions/mean_terminated_length": 36.25, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 128.8148148148148, "frac_reward_zero_std": 0.0, "grad_norm": 3.9544448852539062, "kl": 0.544110544025898, "learning_rate": 1.6916666666666666e-06, "loss": 0.1855, "num_tokens": 2074920.0, "reward": 5.75, "reward_std": 2.598076105117798, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 2.598076105117798, "step": 6956 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.25, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 4.25, "completions/mean_terminated_length": 4.25, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 128.83333333333334, "frac_reward_zero_std": 1.0, "grad_norm": 0.0988290086388588, "kl": 0.005158553831279278, "learning_rate": 1.6911111111111112e-06, "loss": 0.0003, "num_tokens": 2075133.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 6957 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 128.85185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.022957822307944298, "kl": 0.011062758043408394, "learning_rate": 1.6905555555555556e-06, "loss": 0.0006, "num_tokens": 2075445.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 6958 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.5, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 3.5, "completions/mean_terminated_length": 3.5, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 128.87037037037038, "frac_reward_zero_std": 0.0, "grad_norm": 10.979584693908691, "kl": 0.06832090020179749, "learning_rate": 1.6900000000000003e-06, "loss": 0.2441, "num_tokens": 2075655.0, "reward": 2.75, "reward_std": 1.5, "rewards/reward_combined/mean": 2.75, "rewards/reward_combined/std": 1.5, "step": 6959 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 128.88888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.07112076133489609, "kl": 0.0055224953684955835, "learning_rate": 1.6894444444444447e-06, "loss": 0.0003, "num_tokens": 2075923.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6960 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 128.90740740740742, "frac_reward_zero_std": 1.0, "grad_norm": 0.3434562087059021, "kl": 0.0534632820636034, "learning_rate": 1.688888888888889e-06, "loss": 0.0028, "num_tokens": 2076227.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6961 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 38.75, "completions/clipped_ratio": 0.0, "completions/max_length": 53.0, "completions/max_terminated_length": 53.0, "completions/mean_length": 38.75, "completions/mean_terminated_length": 38.75, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 128.92592592592592, "frac_reward_zero_std": 0.0, "grad_norm": 4.64609432220459, "kl": 0.07712111994624138, "learning_rate": 1.6883333333333334e-06, "loss": 0.1758, "num_tokens": 2076634.0, "reward": 5.625, "reward_std": 4.422951698303223, "rewards/reward_combined/mean": 5.625, "rewards/reward_combined/std": 4.422951698303223, "step": 6962 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 128.94444444444446, "frac_reward_zero_std": 1.0, "grad_norm": 0.005248466040939093, "kl": 0.017398105934262276, "learning_rate": 1.687777777777778e-06, "loss": 0.0009, "num_tokens": 2076894.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6963 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 26.5, "completions/clipped_ratio": 0.0, "completions/max_length": 30.0, "completions/max_terminated_length": 30.0, "completions/mean_length": 26.5, "completions/mean_terminated_length": 26.5, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 128.96296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.14780618250370026, "kl": 0.054358480498194695, "learning_rate": 1.6872222222222223e-06, "loss": 0.0027, "num_tokens": 2077228.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6964 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.5, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.5, "completions/mean_terminated_length": 6.5, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 128.9814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.02054859884083271, "kl": 0.0010908643016591668, "learning_rate": 1.6866666666666667e-06, "loss": 0.0001, "num_tokens": 2077462.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 6965 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 18.0, "completions/mean_terminated_length": 18.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 129.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.07748699188232422, "kl": 0.019365277606993914, "learning_rate": 1.686111111111111e-06, "loss": 0.001, "num_tokens": 2077754.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6966 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 129.0185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.07084181904792786, "kl": 0.005716795567423105, "learning_rate": 1.6855555555555556e-06, "loss": 0.0003, "num_tokens": 2078014.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6967 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.25, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.25, "completions/mean_terminated_length": 14.25, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 129.03703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.10928159952163696, "kl": 0.011138736037537456, "learning_rate": 1.6850000000000002e-06, "loss": 0.0006, "num_tokens": 2078283.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6968 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.0, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 21.0, "completions/mean_terminated_length": 21.0, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 129.05555555555554, "frac_reward_zero_std": 1.0, "grad_norm": 0.09296828508377075, "kl": 0.015490835066884756, "learning_rate": 1.6844444444444447e-06, "loss": 0.0008, "num_tokens": 2078611.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6969 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 129.07407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.2339669018983841, "kl": 0.031878734938800335, "learning_rate": 1.683888888888889e-06, "loss": 0.0015, "num_tokens": 2078889.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6970 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.5, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 9.5, "completions/mean_terminated_length": 9.5, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 129.09259259259258, "frac_reward_zero_std": 1.0, "grad_norm": 0.02080053649842739, "kl": 0.0034809797070920467, "learning_rate": 1.6833333333333335e-06, "loss": 0.0002, "num_tokens": 2079151.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6971 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 129.11111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.0021112377289682627, "kl": 0.28255516290664673, "learning_rate": 1.6827777777777778e-06, "loss": 0.0141, "num_tokens": 2079439.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6972 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.25, "completions/clipped_ratio": 0.0, "completions/max_length": 37.0, "completions/max_terminated_length": 37.0, "completions/mean_length": 32.25, "completions/mean_terminated_length": 32.25, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 129.12962962962962, "frac_reward_zero_std": 0.0, "grad_norm": 2.8006370067596436, "kl": 0.415493693202734, "learning_rate": 1.6822222222222224e-06, "loss": 0.0836, "num_tokens": 2079804.0, "reward": 5.25, "reward_std": 2.598076105117798, "rewards/reward_combined/mean": 5.25, "rewards/reward_combined/std": 2.598076105117798, "step": 6973 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 18.0, "completions/mean_terminated_length": 18.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 129.14814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.10327623784542084, "kl": 0.018482052721083164, "learning_rate": 1.6816666666666667e-06, "loss": 0.0011, "num_tokens": 2080096.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6974 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 129.16666666666666, "frac_reward_zero_std": 1.0, "grad_norm": 0.008295697160065174, "kl": 0.0006153732538223267, "learning_rate": 1.681111111111111e-06, "loss": 0.0, "num_tokens": 2080308.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 6975 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 129.1851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.07320208847522736, "kl": 0.012533016502857208, "learning_rate": 1.6805555555555559e-06, "loss": 0.0005, "num_tokens": 2080516.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 6976 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.75, "completions/clipped_ratio": 0.0, "completions/max_length": 37.0, "completions/max_terminated_length": 37.0, "completions/mean_length": 32.75, "completions/mean_terminated_length": 32.75, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 129.2037037037037, "frac_reward_zero_std": 0.0, "grad_norm": 1.873715877532959, "kl": 0.15411778911948204, "learning_rate": 1.6800000000000002e-06, "loss": 0.1397, "num_tokens": 2080867.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 6977 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.75, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 18.75, "completions/mean_terminated_length": 18.75, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 129.22222222222223, "frac_reward_zero_std": 0.0, "grad_norm": 5.280453681945801, "kl": 0.11152469739317894, "learning_rate": 1.6794444444444446e-06, "loss": -0.0529, "num_tokens": 2081194.0, "reward": 6.125, "reward_std": 3.75, "rewards/reward_combined/mean": 6.125, "rewards/reward_combined/std": 3.75, "step": 6978 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.5, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.5, "completions/mean_terminated_length": 6.5, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 129.24074074074073, "frac_reward_zero_std": 1.0, "grad_norm": 0.022119611501693726, "kl": 0.0014724376669619232, "learning_rate": 1.6788888888888891e-06, "loss": 0.0001, "num_tokens": 2081428.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 6979 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.75, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 17.75, "completions/mean_terminated_length": 17.75, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 129.25925925925927, "frac_reward_zero_std": 1.0, "grad_norm": 0.12645700573921204, "kl": 0.16511936485767365, "learning_rate": 1.6783333333333335e-06, "loss": 0.0084, "num_tokens": 2081739.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6980 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 129.27777777777777, "frac_reward_zero_std": 0.0, "grad_norm": 0.4928673505783081, "kl": 0.611974997445941, "learning_rate": 1.6777777777777779e-06, "loss": 0.0363, "num_tokens": 2082033.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 6981 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 129.2962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.03181536868214607, "kl": 0.010435118805617094, "learning_rate": 1.6772222222222222e-06, "loss": 0.0005, "num_tokens": 2082321.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6982 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 36.0, "completions/clipped_ratio": 0.0, "completions/max_length": 45.0, "completions/max_terminated_length": 45.0, "completions/mean_length": 36.0, "completions/mean_terminated_length": 36.0, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 129.3148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.03610335290431976, "kl": 0.08882573992013931, "learning_rate": 1.6766666666666668e-06, "loss": 0.0044, "num_tokens": 2082733.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6983 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.75, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.75, "completions/mean_terminated_length": 13.75, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 129.33333333333334, "frac_reward_zero_std": 1.0, "grad_norm": 0.01908222772181034, "kl": 0.00251063727773726, "learning_rate": 1.6761111111111111e-06, "loss": 0.0001, "num_tokens": 2083012.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6984 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.75, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 12.75, "completions/mean_terminated_length": 12.75, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 129.35185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.21023736894130707, "kl": 0.07514866068959236, "learning_rate": 1.675555555555556e-06, "loss": 0.0037, "num_tokens": 2083295.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6985 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 129.37037037037038, "frac_reward_zero_std": 1.0, "grad_norm": 0.00010807731450768188, "kl": 9.410083293914795e-06, "learning_rate": 1.6750000000000003e-06, "loss": 0.0, "num_tokens": 2083515.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 6986 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.75, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 19.75, "completions/mean_terminated_length": 19.75, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 129.38888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.13648362457752228, "kl": 0.03400488942861557, "learning_rate": 1.6744444444444446e-06, "loss": 0.0016, "num_tokens": 2083814.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6987 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 18.0, "completions/mean_terminated_length": 18.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 129.40740740740742, "frac_reward_zero_std": 1.0, "grad_norm": 0.03812921792268753, "kl": 0.005781574407592416, "learning_rate": 1.673888888888889e-06, "loss": 0.0003, "num_tokens": 2084126.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6988 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 129.42592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.005713712424039841, "kl": 0.017279723659157753, "learning_rate": 1.6733333333333335e-06, "loss": 0.0009, "num_tokens": 2084386.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6989 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.5, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 4.5, "completions/mean_terminated_length": 4.5, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 129.44444444444446, "frac_reward_zero_std": 1.0, "grad_norm": 0.030820447951555252, "kl": 0.0020465850830078125, "learning_rate": 1.672777777777778e-06, "loss": 0.0001, "num_tokens": 2084604.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 6990 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 129.46296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.008384697139263153, "kl": 0.0016524687525816262, "learning_rate": 1.6722222222222223e-06, "loss": 0.0001, "num_tokens": 2084922.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 6991 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.0, "completions/clipped_ratio": 0.0, "completions/max_length": 31.0, "completions/max_terminated_length": 31.0, "completions/mean_length": 24.0, "completions/mean_terminated_length": 24.0, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 129.4814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.04821626469492912, "kl": 0.05514254793524742, "learning_rate": 1.6716666666666666e-06, "loss": 0.0026, "num_tokens": 2085246.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6992 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.5, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 22.5, "completions/mean_terminated_length": 22.5, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 129.5, "frac_reward_zero_std": 1.0, "grad_norm": 0.04544272646307945, "kl": 0.013388224877417088, "learning_rate": 1.6711111111111112e-06, "loss": 0.0007, "num_tokens": 2085564.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6993 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.007462686393409967, "clip_ratio/low_min": 0.007462686393409967, "clip_ratio/region_mean": 0.007462686393409967, "completion_length": 31.5, "completions/clipped_ratio": 0.0, "completions/max_length": 45.0, "completions/max_terminated_length": 45.0, "completions/mean_length": 31.5, "completions/mean_terminated_length": 31.5, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 129.5185185185185, "frac_reward_zero_std": 0.0, "grad_norm": 5.506405830383301, "kl": 0.19431142136454582, "learning_rate": 1.6705555555555557e-06, "loss": 0.1806, "num_tokens": 2085906.0, "reward": 5.75, "reward_std": 4.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 4.5, "step": 6994 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.25, "completions/clipped_ratio": 0.0, "completions/max_length": 29.0, "completions/max_terminated_length": 29.0, "completions/mean_length": 24.25, "completions/mean_terminated_length": 24.25, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 129.53703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.03896956890821457, "kl": 0.06207462027668953, "learning_rate": 1.6700000000000003e-06, "loss": 0.0031, "num_tokens": 2086223.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 6995 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.75, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.75, "completions/mean_terminated_length": 14.75, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 129.55555555555554, "frac_reward_zero_std": 1.0, "grad_norm": 0.09787684679031372, "kl": 0.023340976797044277, "learning_rate": 1.6694444444444447e-06, "loss": 0.0012, "num_tokens": 2086498.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6996 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 129.57407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.030955269932746887, "kl": 0.02873118966817856, "learning_rate": 1.668888888888889e-06, "loss": 0.0014, "num_tokens": 2086714.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 6997 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 7.75, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 7.75, "completions/mean_terminated_length": 7.75, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 129.59259259259258, "frac_reward_zero_std": 1.0, "grad_norm": 0.18719349801540375, "kl": 0.017637168988585472, "learning_rate": 1.6683333333333334e-06, "loss": 0.0008, "num_tokens": 2086957.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 6998 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 129.61111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.03746271878480911, "kl": 0.004770450759679079, "learning_rate": 1.667777777777778e-06, "loss": 0.0002, "num_tokens": 2087241.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 6999 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 129.62962962962962, "frac_reward_zero_std": 0.0, "grad_norm": 4.497762680053711, "kl": 0.054524714476428926, "learning_rate": 1.6672222222222223e-06, "loss": 0.0014, "num_tokens": 2087501.0, "reward": 2.75, "reward_std": 1.5, "rewards/reward_combined/mean": 2.75, "rewards/reward_combined/std": 1.5, "step": 7000 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 129.64814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.051056694239377975, "kl": 0.013808176852762699, "learning_rate": 1.6666666666666667e-06, "loss": 0.0007, "num_tokens": 2087769.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7001 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 129.66666666666666, "frac_reward_zero_std": 1.0, "grad_norm": 0.014025627635419369, "kl": 0.002051351562840864, "learning_rate": 1.6661111111111112e-06, "loss": 0.0001, "num_tokens": 2088065.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7002 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 129.6851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.00955331139266491, "kl": 0.0002955526069854386, "learning_rate": 1.6655555555555558e-06, "loss": 0.0, "num_tokens": 2088321.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 7003 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 34.25, "completions/clipped_ratio": 0.0, "completions/max_length": 36.0, "completions/max_terminated_length": 36.0, "completions/mean_length": 34.25, "completions/mean_terminated_length": 34.25, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 129.7037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.02317102998495102, "kl": 0.06966765224933624, "learning_rate": 1.6650000000000002e-06, "loss": 0.0035, "num_tokens": 2088686.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7004 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 129.72222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.004519567359238863, "kl": 0.010308153927326202, "learning_rate": 1.6644444444444447e-06, "loss": 0.0005, "num_tokens": 2088922.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 7005 }, { "clip_ratio/high_max": 0.0069444444961845875, "clip_ratio/high_mean": 0.0069444444961845875, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0069444444961845875, "completion_length": 34.0, "completions/clipped_ratio": 0.0, "completions/max_length": 39.0, "completions/max_terminated_length": 39.0, "completions/mean_length": 34.0, "completions/mean_terminated_length": 34.0, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 129.74074074074073, "frac_reward_zero_std": 0.0, "grad_norm": 2.3530526161193848, "kl": 0.1490822657942772, "learning_rate": 1.663888888888889e-06, "loss": 0.0072, "num_tokens": 2089282.0, "reward": 5.75, "reward_std": 2.598076105117798, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 2.598076105117798, "step": 7006 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 129.75925925925927, "frac_reward_zero_std": 1.0, "grad_norm": 0.059950392693281174, "kl": 0.0025029381504282355, "learning_rate": 1.6633333333333334e-06, "loss": 0.0001, "num_tokens": 2089550.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7007 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 129.77777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.02352936938405037, "kl": 0.010735549032688141, "learning_rate": 1.6627777777777778e-06, "loss": 0.0005, "num_tokens": 2089862.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 7008 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.5, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 20.5, "completions/mean_terminated_length": 20.5, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 129.7962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.03456749767065048, "kl": 0.007511656731367111, "learning_rate": 1.6622222222222224e-06, "loss": 0.0004, "num_tokens": 2090192.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 7009 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 33.25, "completions/clipped_ratio": 0.0, "completions/max_length": 47.0, "completions/max_terminated_length": 47.0, "completions/mean_length": 33.25, "completions/mean_terminated_length": 33.25, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 129.8148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.058970384299755096, "kl": 0.0416389349848032, "learning_rate": 1.6616666666666667e-06, "loss": 0.0021, "num_tokens": 2090549.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7010 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 129.83333333333334, "frac_reward_zero_std": 1.0, "grad_norm": 0.017547249794006348, "kl": 0.25225260853767395, "learning_rate": 1.661111111111111e-06, "loss": 0.0126, "num_tokens": 2090847.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7011 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 129.85185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.009655811823904514, "kl": 0.009675743989646435, "learning_rate": 1.6605555555555558e-06, "loss": 0.0005, "num_tokens": 2091119.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7012 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.5, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 21.5, "completions/mean_terminated_length": 21.5, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 129.87037037037038, "frac_reward_zero_std": 1.0, "grad_norm": 0.07852412760257721, "kl": 0.021730611100792885, "learning_rate": 1.6600000000000002e-06, "loss": 0.0011, "num_tokens": 2091453.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7013 }, { "clip_ratio/high_max": 0.005681818351149559, "clip_ratio/high_mean": 0.005681818351149559, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005681818351149559, "completion_length": 42.25, "completions/clipped_ratio": 0.0, "completions/max_length": 51.0, "completions/max_terminated_length": 51.0, "completions/mean_length": 42.25, "completions/mean_terminated_length": 42.25, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 129.88888888888889, "frac_reward_zero_std": 0.0, "grad_norm": 1.1413116455078125, "kl": 0.05915989726781845, "learning_rate": 1.6594444444444446e-06, "loss": -0.1256, "num_tokens": 2091902.0, "reward": 2.375, "reward_std": 1.25, "rewards/reward_combined/mean": 2.375, "rewards/reward_combined/std": 1.25, "step": 7014 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 129.90740740740742, "frac_reward_zero_std": 1.0, "grad_norm": 0.007426044438034296, "kl": 0.0016364604234695435, "learning_rate": 1.6588888888888891e-06, "loss": 0.0001, "num_tokens": 2092114.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 7015 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 129.92592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.03501538187265396, "kl": 0.00457228347659111, "learning_rate": 1.6583333333333335e-06, "loss": 0.0002, "num_tokens": 2092374.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 7016 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 129.94444444444446, "frac_reward_zero_std": 1.0, "grad_norm": 0.03467134013772011, "kl": 0.1816147416830063, "learning_rate": 1.6577777777777778e-06, "loss": 0.0091, "num_tokens": 2092658.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7017 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.5, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 19.5, "completions/mean_terminated_length": 19.5, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 129.96296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.04386475309729576, "kl": 0.012325247284024954, "learning_rate": 1.6572222222222222e-06, "loss": 0.0006, "num_tokens": 2092964.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7018 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 26.0, "completions/clipped_ratio": 0.0, "completions/max_length": 38.0, "completions/max_terminated_length": 38.0, "completions/mean_length": 26.0, "completions/mean_terminated_length": 26.0, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 129.9814814814815, "frac_reward_zero_std": 0.0, "grad_norm": 2.1304142475128174, "kl": 0.03567609190940857, "learning_rate": 1.6566666666666668e-06, "loss": -0.164, "num_tokens": 2093320.0, "reward": 7.625, "reward_std": 0.25, "rewards/reward_combined/mean": 7.625, "rewards/reward_combined/std": 0.25, "step": 7019 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 130.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.03802544251084328, "kl": 0.027663599175866693, "learning_rate": 1.6561111111111111e-06, "loss": 0.0015, "num_tokens": 2093618.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7020 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 37.75, "completions/clipped_ratio": 0.0, "completions/max_length": 42.0, "completions/max_terminated_length": 42.0, "completions/mean_length": 37.75, "completions/mean_terminated_length": 37.75, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 130.0185185185185, "frac_reward_zero_std": 0.0, "grad_norm": 2.337768077850342, "kl": 0.2487993948161602, "learning_rate": 1.6555555555555559e-06, "loss": 0.0386, "num_tokens": 2094005.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 7021 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 42.5, "completions/clipped_ratio": 0.0, "completions/max_length": 111.0, "completions/max_terminated_length": 111.0, "completions/mean_length": 42.5, "completions/mean_terminated_length": 42.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 130.03703703703704, "frac_reward_zero_std": 0.0, "grad_norm": 2.698171854019165, "kl": 0.04927871748805046, "learning_rate": 1.6550000000000002e-06, "loss": 0.3536, "num_tokens": 2094427.0, "reward": 7.5, "reward_std": 1.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 1.0, "step": 7022 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 130.05555555555554, "frac_reward_zero_std": 1.0, "grad_norm": 0.006737370975315571, "kl": 0.0005606033519143239, "learning_rate": 1.6544444444444446e-06, "loss": 0.0, "num_tokens": 2094723.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7023 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 130.07407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.006347951479256153, "kl": 0.2263498604297638, "learning_rate": 1.653888888888889e-06, "loss": 0.0113, "num_tokens": 2095025.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7024 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.5, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 2.5, "completions/mean_terminated_length": 2.5, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 130.09259259259258, "frac_reward_zero_std": 1.0, "grad_norm": 0.105105459690094, "kl": 0.014772042632102966, "learning_rate": 1.6533333333333335e-06, "loss": 0.0007, "num_tokens": 2095231.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 7025 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 130.11111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.008042697794735432, "kl": 0.0004924511013086885, "learning_rate": 1.6527777777777779e-06, "loss": 0.0, "num_tokens": 2095503.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7026 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 38.75, "completions/clipped_ratio": 0.0, "completions/max_length": 44.0, "completions/max_terminated_length": 44.0, "completions/mean_length": 38.75, "completions/mean_terminated_length": 38.75, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 130.12962962962962, "frac_reward_zero_std": 1.0, "grad_norm": 0.4440840482711792, "kl": 0.17602595314383507, "learning_rate": 1.6522222222222222e-06, "loss": 0.0094, "num_tokens": 2095926.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7027 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.5, "completions/clipped_ratio": 0.0, "completions/max_length": 31.0, "completions/max_terminated_length": 31.0, "completions/mean_length": 24.5, "completions/mean_terminated_length": 24.5, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 130.14814814814815, "frac_reward_zero_std": 0.0, "grad_norm": 51.27631378173828, "kl": 6.599862158298492, "learning_rate": 1.6516666666666666e-06, "loss": 0.154, "num_tokens": 2096240.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 7028 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 130.16666666666666, "frac_reward_zero_std": 1.0, "grad_norm": 0.024575740098953247, "kl": 0.01028626598417759, "learning_rate": 1.6511111111111112e-06, "loss": 0.0005, "num_tokens": 2096552.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 7029 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 130.1851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.007575655356049538, "kl": 0.0016424357891082764, "learning_rate": 1.6505555555555557e-06, "loss": 0.0001, "num_tokens": 2096764.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 7030 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.75, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 19.75, "completions/mean_terminated_length": 19.75, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 130.2037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.5713506937026978, "kl": 0.15412794426083565, "learning_rate": 1.6500000000000003e-06, "loss": 0.0081, "num_tokens": 2097091.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7031 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.75, "completions/clipped_ratio": 0.0, "completions/max_length": 34.0, "completions/max_terminated_length": 34.0, "completions/mean_length": 16.75, "completions/mean_terminated_length": 16.75, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 130.22222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.2734457850456238, "kl": 0.03362005180679262, "learning_rate": 1.6494444444444447e-06, "loss": 0.0025, "num_tokens": 2097378.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7032 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.5, "completions/clipped_ratio": 0.0, "completions/max_length": 34.0, "completions/max_terminated_length": 34.0, "completions/mean_length": 25.5, "completions/mean_terminated_length": 25.5, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 130.24074074074073, "frac_reward_zero_std": 1.0, "grad_norm": 0.17368796467781067, "kl": 0.043813359923660755, "learning_rate": 1.648888888888889e-06, "loss": 0.0027, "num_tokens": 2097728.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 7033 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.5, "completions/clipped_ratio": 0.0, "completions/max_length": 30.0, "completions/max_terminated_length": 30.0, "completions/mean_length": 21.5, "completions/mean_terminated_length": 21.5, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 130.25925925925927, "frac_reward_zero_std": 0.0, "grad_norm": 12.998973846435547, "kl": 0.0898030512034893, "learning_rate": 1.6483333333333334e-06, "loss": 0.0973, "num_tokens": 2098034.0, "reward": 2.75, "reward_std": 2.5, "rewards/reward_combined/mean": 2.75, "rewards/reward_combined/std": 2.5, "step": 7034 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.5, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 22.5, "completions/mean_terminated_length": 22.5, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 130.27777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.04694867879152298, "kl": 0.013498855289071798, "learning_rate": 1.647777777777778e-06, "loss": 0.0007, "num_tokens": 2098352.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7035 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 130.2962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.39042121171951294, "kl": 0.03961814066860825, "learning_rate": 1.6472222222222223e-06, "loss": 0.0026, "num_tokens": 2098592.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 7036 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.5, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 19.5, "completions/mean_terminated_length": 19.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 130.3148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.06808025389909744, "kl": 0.028306610882282257, "learning_rate": 1.6466666666666666e-06, "loss": 0.0014, "num_tokens": 2098890.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7037 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 130.33333333333334, "frac_reward_zero_std": 1.0, "grad_norm": 0.005875167436897755, "kl": 0.01724378950893879, "learning_rate": 1.6461111111111114e-06, "loss": 0.0009, "num_tokens": 2099150.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7038 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 42.25, "completions/clipped_ratio": 0.0, "completions/max_length": 51.0, "completions/max_terminated_length": 51.0, "completions/mean_length": 42.25, "completions/mean_terminated_length": 42.25, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 130.35185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.1691375970840454, "kl": 0.08046058565378189, "learning_rate": 1.6455555555555558e-06, "loss": 0.0038, "num_tokens": 2099599.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 7039 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 130.37037037037038, "frac_reward_zero_std": 1.0, "grad_norm": 0.019924301654100418, "kl": 0.003623761935159564, "learning_rate": 1.6450000000000001e-06, "loss": 0.0002, "num_tokens": 2099899.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7040 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 130.38888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.03315142169594765, "kl": 0.010150464251637459, "learning_rate": 1.6444444444444447e-06, "loss": 0.0005, "num_tokens": 2100191.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7041 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 34.5, "completions/clipped_ratio": 0.0, "completions/max_length": 36.0, "completions/max_terminated_length": 36.0, "completions/mean_length": 34.5, "completions/mean_terminated_length": 34.5, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 130.40740740740742, "frac_reward_zero_std": 0.0, "grad_norm": 2.499457597732544, "kl": 0.0646841935813427, "learning_rate": 1.643888888888889e-06, "loss": 0.0238, "num_tokens": 2100557.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 7042 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 130.42592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.007721553090959787, "kl": 0.00017073153867386281, "learning_rate": 1.6433333333333334e-06, "loss": 0.0, "num_tokens": 2100813.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 7043 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 130.44444444444446, "frac_reward_zero_std": 1.0, "grad_norm": 0.02820155955851078, "kl": 0.006299514789134264, "learning_rate": 1.6427777777777778e-06, "loss": 0.0003, "num_tokens": 2101101.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7044 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 130.46296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.020983237773180008, "kl": 0.009813343174755573, "learning_rate": 1.6422222222222223e-06, "loss": 0.0005, "num_tokens": 2101373.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7045 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.75, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.75, "completions/mean_terminated_length": 14.75, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 130.4814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.08659570664167404, "kl": 0.0883437767624855, "learning_rate": 1.6416666666666667e-06, "loss": 0.0044, "num_tokens": 2101656.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7046 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.014285714365541935, "clip_ratio/low_min": 0.014285714365541935, "clip_ratio/region_mean": 0.014285714365541935, "completion_length": 19.75, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 19.75, "completions/mean_terminated_length": 19.75, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 130.5, "frac_reward_zero_std": 0.0, "grad_norm": 2.036041021347046, "kl": 0.10400204360485077, "learning_rate": 1.6411111111111115e-06, "loss": -0.0374, "num_tokens": 2101963.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 7047 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 130.5185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.02893601916730404, "kl": 0.004492097534239292, "learning_rate": 1.6405555555555558e-06, "loss": 0.0002, "num_tokens": 2102231.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7048 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.25, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 17.25, "completions/mean_terminated_length": 17.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 130.53703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.06141865625977516, "kl": 0.014061595313251019, "learning_rate": 1.6400000000000002e-06, "loss": 0.0006, "num_tokens": 2102544.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7049 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.75, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 17.75, "completions/mean_terminated_length": 17.75, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 130.55555555555554, "frac_reward_zero_std": 1.0, "grad_norm": 0.06686927378177643, "kl": 0.00932990713045001, "learning_rate": 1.6394444444444445e-06, "loss": 0.0005, "num_tokens": 2102855.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7050 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.75, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 12.75, "completions/mean_terminated_length": 12.75, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 130.57407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.09910933673381805, "kl": 0.00973056920338422, "learning_rate": 1.638888888888889e-06, "loss": 0.0005, "num_tokens": 2103118.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7051 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 130.59259259259258, "frac_reward_zero_std": 1.0, "grad_norm": 0.1035124808549881, "kl": 0.0282320911064744, "learning_rate": 1.6383333333333335e-06, "loss": 0.0014, "num_tokens": 2103410.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7052 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 130.61111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.00011763739166781306, "kl": 1.0274350643157959e-05, "learning_rate": 1.6377777777777778e-06, "loss": 0.0, "num_tokens": 2103630.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 7053 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 130.62962962962962, "frac_reward_zero_std": 1.0, "grad_norm": 0.0008183963363990188, "kl": 0.0017551205237396061, "learning_rate": 1.6372222222222222e-06, "loss": 0.0001, "num_tokens": 2103910.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7054 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.25, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 4.25, "completions/mean_terminated_length": 4.25, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 130.64814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.06366243213415146, "kl": 0.0023180306889116764, "learning_rate": 1.6366666666666667e-06, "loss": 0.0001, "num_tokens": 2104123.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 7055 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.75, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 17.75, "completions/mean_terminated_length": 17.75, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 130.66666666666666, "frac_reward_zero_std": 1.0, "grad_norm": 0.0999535545706749, "kl": 0.022442468907684088, "learning_rate": 1.6361111111111113e-06, "loss": 0.0012, "num_tokens": 2104422.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7056 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 16.5, "completions/mean_terminated_length": 16.5, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 130.6851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.07055215537548065, "kl": 0.11936882883310318, "learning_rate": 1.6355555555555559e-06, "loss": 0.006, "num_tokens": 2104728.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7057 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.75, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 12.75, "completions/mean_terminated_length": 12.75, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 130.7037037037037, "frac_reward_zero_std": 0.0, "grad_norm": 6.241089344024658, "kl": 0.010932185308774933, "learning_rate": 1.6350000000000002e-06, "loss": 0.1775, "num_tokens": 2104999.0, "reward": 2.75, "reward_std": 1.5, "rewards/reward_combined/mean": 2.75, "rewards/reward_combined/std": 1.5, "step": 7058 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.0, "completions/clipped_ratio": 0.0, "completions/max_length": 28.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 19.0, "completions/mean_terminated_length": 19.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 130.72222222222223, "frac_reward_zero_std": 0.0, "grad_norm": 2.0063958168029785, "kl": 0.20750651508569717, "learning_rate": 1.6344444444444446e-06, "loss": 0.1477, "num_tokens": 2105295.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 7059 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 130.74074074074073, "frac_reward_zero_std": 1.0, "grad_norm": 0.014699546620249748, "kl": 0.006494863424450159, "learning_rate": 1.633888888888889e-06, "loss": 0.0003, "num_tokens": 2105575.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7060 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 34.5, "completions/clipped_ratio": 0.0, "completions/max_length": 38.0, "completions/max_terminated_length": 38.0, "completions/mean_length": 34.5, "completions/mean_terminated_length": 34.5, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 130.75925925925927, "frac_reward_zero_std": 1.0, "grad_norm": 9.681075096130371, "kl": 0.6894440688192844, "learning_rate": 1.6333333333333335e-06, "loss": 0.0312, "num_tokens": 2105937.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7061 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.5, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 16.5, "completions/mean_terminated_length": 16.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 130.77777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.23156550526618958, "kl": 0.05420555733144283, "learning_rate": 1.6327777777777779e-06, "loss": 0.0028, "num_tokens": 2106219.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7062 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 27.5, "completions/clipped_ratio": 0.0, "completions/max_length": 48.0, "completions/max_terminated_length": 48.0, "completions/mean_length": 27.5, "completions/mean_terminated_length": 27.5, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 130.7962962962963, "frac_reward_zero_std": 0.0, "grad_norm": 2.3359811305999756, "kl": 0.10302901454269886, "learning_rate": 1.6322222222222222e-06, "loss": -0.0383, "num_tokens": 2106581.0, "reward": 3.625, "reward_std": 4.589389801025391, "rewards/reward_combined/mean": 3.625, "rewards/reward_combined/std": 4.589389801025391, "step": 7063 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 130.8148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.004384786821901798, "kl": 0.010348670184612274, "learning_rate": 1.6316666666666666e-06, "loss": 0.0005, "num_tokens": 2106817.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 7064 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 130.83333333333334, "frac_reward_zero_std": 1.0, "grad_norm": 0.0028586622793227434, "kl": 0.2823387533426285, "learning_rate": 1.6311111111111114e-06, "loss": 0.0141, "num_tokens": 2107105.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7065 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 130.85185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.035208094865083694, "kl": 0.0026562511920928955, "learning_rate": 1.6305555555555557e-06, "loss": 0.0001, "num_tokens": 2107349.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 7066 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 9.5, "completions/mean_terminated_length": 9.5, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 130.87037037037038, "frac_reward_zero_std": 1.0, "grad_norm": 0.046371940523386, "kl": 0.0038525641430169344, "learning_rate": 1.6300000000000003e-06, "loss": 0.0002, "num_tokens": 2107607.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7067 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 130.88888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.0339149571955204, "kl": 0.004260573536157608, "learning_rate": 1.6294444444444446e-06, "loss": 0.0002, "num_tokens": 2107867.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 7068 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 130.90740740740742, "frac_reward_zero_std": 1.0, "grad_norm": 0.022969435900449753, "kl": 0.004480662057176232, "learning_rate": 1.628888888888889e-06, "loss": 0.0002, "num_tokens": 2108151.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7069 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.25, "completions/clipped_ratio": 0.0, "completions/max_length": 37.0, "completions/max_terminated_length": 37.0, "completions/mean_length": 32.25, "completions/mean_terminated_length": 32.25, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 130.92592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.22026993334293365, "kl": 0.18869464844465256, "learning_rate": 1.6283333333333333e-06, "loss": 0.0094, "num_tokens": 2108504.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7070 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 5.25, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 5.25, "completions/mean_terminated_length": 5.25, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 130.94444444444446, "frac_reward_zero_std": 1.0, "grad_norm": 0.223470076918602, "kl": 0.024962180483271368, "learning_rate": 1.627777777777778e-06, "loss": 0.0014, "num_tokens": 2108725.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 7071 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 25.0, "completions/max_terminated_length": 25.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 130.96296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.0726991593837738, "kl": 0.028438854962587357, "learning_rate": 1.6272222222222223e-06, "loss": 0.0014, "num_tokens": 2108973.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 7072 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 130.9814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.011091436259448528, "kl": 0.0015905426116660237, "learning_rate": 1.6266666666666666e-06, "loss": 0.0001, "num_tokens": 2109293.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7073 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.5, "completions/clipped_ratio": 0.0, "completions/max_length": 37.0, "completions/max_terminated_length": 37.0, "completions/mean_length": 32.5, "completions/mean_terminated_length": 32.5, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 131.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.04307812079787254, "kl": 0.04177049919962883, "learning_rate": 1.6261111111111114e-06, "loss": 0.0021, "num_tokens": 2109643.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7074 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 27.75, "completions/clipped_ratio": 0.0, "completions/max_length": 28.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 27.75, "completions/mean_terminated_length": 27.75, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 131.0185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.08882828801870346, "kl": 0.16069677472114563, "learning_rate": 1.6255555555555558e-06, "loss": 0.008, "num_tokens": 2109978.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7075 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 131.03703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.007404453121125698, "kl": 0.0016242563724517822, "learning_rate": 1.6250000000000001e-06, "loss": 0.0001, "num_tokens": 2110190.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 7076 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 131.05555555555554, "frac_reward_zero_std": 1.0, "grad_norm": 0.04313664510846138, "kl": 0.004895076155662537, "learning_rate": 1.6244444444444447e-06, "loss": 0.0002, "num_tokens": 2110450.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 7077 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 131.07407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.0911942645907402, "kl": 0.009294331073760986, "learning_rate": 1.623888888888889e-06, "loss": 0.0005, "num_tokens": 2110658.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 7078 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 42.0, "completions/clipped_ratio": 0.0, "completions/max_length": 45.0, "completions/max_terminated_length": 45.0, "completions/mean_length": 42.0, "completions/mean_terminated_length": 42.0, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 131.09259259259258, "frac_reward_zero_std": 1.0, "grad_norm": 0.013720417395234108, "kl": 0.07706823199987411, "learning_rate": 1.6233333333333334e-06, "loss": 0.0038, "num_tokens": 2111094.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7079 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.75, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 17.75, "completions/mean_terminated_length": 17.75, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 131.11111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.06474006175994873, "kl": 0.01403840258717537, "learning_rate": 1.6227777777777777e-06, "loss": 0.0007, "num_tokens": 2111389.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7080 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.5, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 19.5, "completions/mean_terminated_length": 19.5, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 131.12962962962962, "frac_reward_zero_std": 1.0, "grad_norm": 0.026664920151233673, "kl": 0.009803671389818192, "learning_rate": 1.6222222222222223e-06, "loss": 0.0005, "num_tokens": 2111695.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7081 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 131.14814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.03790074959397316, "kl": 0.005417557433247566, "learning_rate": 1.6216666666666667e-06, "loss": 0.0003, "num_tokens": 2111983.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7082 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 131.16666666666666, "frac_reward_zero_std": 1.0, "grad_norm": 0.08104467391967773, "kl": 0.003974196151830256, "learning_rate": 1.6211111111111114e-06, "loss": 0.0002, "num_tokens": 2112251.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7083 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.75, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 4.75, "completions/mean_terminated_length": 4.75, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 131.1851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.04001113027334213, "kl": 0.00166324225574499, "learning_rate": 1.6205555555555558e-06, "loss": 0.0001, "num_tokens": 2112470.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 7084 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 10.5, "completions/mean_terminated_length": 10.5, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 131.2037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.00959837343543768, "kl": 0.003456221253145486, "learning_rate": 1.6200000000000002e-06, "loss": 0.0002, "num_tokens": 2112736.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7085 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.5, "completions/clipped_ratio": 0.0, "completions/max_length": 27.0, "completions/max_terminated_length": 27.0, "completions/mean_length": 18.5, "completions/mean_terminated_length": 18.5, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 131.22222222222223, "frac_reward_zero_std": 0.0, "grad_norm": 4.611340522766113, "kl": 0.09737893636338413, "learning_rate": 1.6194444444444445e-06, "loss": -0.2854, "num_tokens": 2113058.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 7086 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 31.5, "completions/clipped_ratio": 0.0, "completions/max_length": 37.0, "completions/max_terminated_length": 37.0, "completions/mean_length": 31.5, "completions/mean_terminated_length": 31.5, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 131.24074074074073, "frac_reward_zero_std": 0.0, "grad_norm": 2.787550210952759, "kl": 0.2796998992562294, "learning_rate": 1.618888888888889e-06, "loss": 0.0105, "num_tokens": 2113420.0, "reward": 4.625, "reward_std": 3.4731109142303467, "rewards/reward_combined/mean": 4.625, "rewards/reward_combined/std": 3.4731109142303467, "step": 7087 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 131.25925925925927, "frac_reward_zero_std": 1.0, "grad_norm": 0.00016049064288381487, "kl": 2.1412968635559082e-05, "learning_rate": 1.6183333333333334e-06, "loss": 0.0, "num_tokens": 2113640.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 7088 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.25, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.25, "completions/mean_terminated_length": 11.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 131.27777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.02721322514116764, "kl": 0.007995512103661895, "learning_rate": 1.6177777777777778e-06, "loss": 0.0004, "num_tokens": 2113913.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7089 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.25, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 13.25, "completions/mean_terminated_length": 13.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 131.2962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.23867952823638916, "kl": 0.05088693555444479, "learning_rate": 1.6172222222222221e-06, "loss": 0.0025, "num_tokens": 2114190.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7090 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.25, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 9.25, "completions/mean_terminated_length": 9.25, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 131.3148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.01900135725736618, "kl": 0.002456976566463709, "learning_rate": 1.6166666666666667e-06, "loss": 0.0001, "num_tokens": 2114439.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 7091 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 16.5, "completions/mean_terminated_length": 16.5, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 131.33333333333334, "frac_reward_zero_std": 1.0, "grad_norm": 0.023275157436728477, "kl": 0.004797910340130329, "learning_rate": 1.6161111111111113e-06, "loss": 0.0002, "num_tokens": 2114745.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7092 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 35.75, "completions/clipped_ratio": 0.0, "completions/max_length": 37.0, "completions/max_terminated_length": 37.0, "completions/mean_length": 35.75, "completions/mean_terminated_length": 35.75, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 131.35185185185185, "frac_reward_zero_std": 0.0, "grad_norm": 1.507219910621643, "kl": 0.05045771971344948, "learning_rate": 1.6155555555555559e-06, "loss": -0.031, "num_tokens": 2115108.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 7093 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 36.5, "completions/clipped_ratio": 0.0, "completions/max_length": 48.0, "completions/max_terminated_length": 48.0, "completions/mean_length": 36.5, "completions/mean_terminated_length": 36.5, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 131.37037037037038, "frac_reward_zero_std": 0.0, "grad_norm": 2.568248748779297, "kl": 0.07637689262628555, "learning_rate": 1.6150000000000002e-06, "loss": 0.0578, "num_tokens": 2115478.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 7094 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 131.38888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.03218376263976097, "kl": 0.002290316508151591, "learning_rate": 1.6144444444444446e-06, "loss": 0.0001, "num_tokens": 2115744.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7095 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.75, "completions/clipped_ratio": 0.0, "completions/max_length": 29.0, "completions/max_terminated_length": 29.0, "completions/mean_length": 22.75, "completions/mean_terminated_length": 22.75, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 131.40740740740742, "frac_reward_zero_std": 1.0, "grad_norm": 0.11209921538829803, "kl": 0.10673601180315018, "learning_rate": 1.613888888888889e-06, "loss": 0.0057, "num_tokens": 2116051.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7096 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 131.42592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.128663569688797, "kl": 0.01909100916236639, "learning_rate": 1.6133333333333335e-06, "loss": 0.0011, "num_tokens": 2116339.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7097 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.5, "completions/clipped_ratio": 0.0, "completions/max_length": 27.0, "completions/max_terminated_length": 27.0, "completions/mean_length": 19.5, "completions/mean_terminated_length": 19.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 131.44444444444446, "frac_reward_zero_std": 1.0, "grad_norm": 0.2546357810497284, "kl": 0.0969974584877491, "learning_rate": 1.6127777777777778e-06, "loss": 0.0048, "num_tokens": 2116637.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 7098 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 7.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 7.5, "completions/mean_terminated_length": 7.5, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 131.46296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.05552547797560692, "kl": 0.02452223002910614, "learning_rate": 1.6122222222222222e-06, "loss": 0.0011, "num_tokens": 2116867.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 7099 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.75, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 14.75, "completions/mean_terminated_length": 14.75, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 131.4814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.22693820297718048, "kl": 0.04279670864343643, "learning_rate": 1.611666666666667e-06, "loss": 0.0025, "num_tokens": 2117146.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7100 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.25, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 9.25, "completions/mean_terminated_length": 9.25, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 131.5, "frac_reward_zero_std": 1.0, "grad_norm": 0.09104184061288834, "kl": 0.017268257215619087, "learning_rate": 1.6111111111111113e-06, "loss": 0.0009, "num_tokens": 2117407.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7101 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 131.5185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.05299137160181999, "kl": 0.01431264029815793, "learning_rate": 1.6105555555555557e-06, "loss": 0.0007, "num_tokens": 2117679.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7102 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 131.53703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.0051735807210206985, "kl": 0.0007815450371708721, "learning_rate": 1.6100000000000003e-06, "loss": 0.0, "num_tokens": 2117939.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 7103 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 44.5, "completions/clipped_ratio": 0.0, "completions/max_length": 51.0, "completions/max_terminated_length": 51.0, "completions/mean_length": 44.5, "completions/mean_terminated_length": 44.5, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "epoch": 131.55555555555554, "frac_reward_zero_std": 1.0, "grad_norm": 0.05851719155907631, "kl": 0.05261060409247875, "learning_rate": 1.6094444444444446e-06, "loss": 0.0026, "num_tokens": 2118397.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 7104 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.0, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 22.0, "completions/mean_terminated_length": 22.0, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 131.57407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.1501002311706543, "kl": 0.026964912191033363, "learning_rate": 1.608888888888889e-06, "loss": 0.0013, "num_tokens": 2118713.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7105 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.25, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 17.25, "completions/mean_terminated_length": 17.25, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 131.59259259259258, "frac_reward_zero_std": 1.0, "grad_norm": 0.08613701164722443, "kl": 0.16121889650821686, "learning_rate": 1.6083333333333333e-06, "loss": 0.0081, "num_tokens": 2119022.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7106 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.5, "completions/clipped_ratio": 0.0, "completions/max_length": 42.0, "completions/max_terminated_length": 42.0, "completions/mean_length": 22.5, "completions/mean_terminated_length": 22.5, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 131.61111111111111, "frac_reward_zero_std": 0.0, "grad_norm": 4.118946075439453, "kl": 0.15464754402637482, "learning_rate": 1.6077777777777779e-06, "loss": 0.2329, "num_tokens": 2119332.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 7107 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.0, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 19.0, "completions/mean_terminated_length": 19.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 131.62962962962962, "frac_reward_zero_std": 0.0, "grad_norm": 3.287919044494629, "kl": 0.03921413980424404, "learning_rate": 1.6072222222222222e-06, "loss": 0.0146, "num_tokens": 2119628.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 7108 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.25, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 21.25, "completions/mean_terminated_length": 21.25, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 131.64814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.0746886357665062, "kl": 0.014823246747255325, "learning_rate": 1.606666666666667e-06, "loss": 0.0007, "num_tokens": 2119957.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7109 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 131.66666666666666, "frac_reward_zero_std": 1.0, "grad_norm": 0.010058668442070484, "kl": 0.23919252306222916, "learning_rate": 1.6061111111111114e-06, "loss": 0.0119, "num_tokens": 2120257.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7110 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.5, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.5, "completions/mean_terminated_length": 6.5, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 131.6851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.025824112817645073, "kl": 0.001475628698244691, "learning_rate": 1.6055555555555557e-06, "loss": 0.0001, "num_tokens": 2120491.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 7111 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.25, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 17.25, "completions/mean_terminated_length": 17.25, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 131.7037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.0565282441675663, "kl": 0.008004487259313464, "learning_rate": 1.605e-06, "loss": 0.0004, "num_tokens": 2120800.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7112 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 131.72222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.13785575330257416, "kl": 0.0428971815854311, "learning_rate": 1.6044444444444447e-06, "loss": 0.0022, "num_tokens": 2121096.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7113 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.75, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 13.75, "completions/mean_terminated_length": 13.75, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 131.74074074074073, "frac_reward_zero_std": 1.0, "grad_norm": 0.0888969749212265, "kl": 0.039200250059366226, "learning_rate": 1.603888888888889e-06, "loss": 0.0024, "num_tokens": 2121403.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7114 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 131.75925925925927, "frac_reward_zero_std": 1.0, "grad_norm": 0.0061604841612279415, "kl": 0.0015731186140328646, "learning_rate": 1.6033333333333334e-06, "loss": 0.0001, "num_tokens": 2121723.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7115 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 131.77777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.005915297195315361, "kl": 0.00015038550191093236, "learning_rate": 1.6027777777777777e-06, "loss": 0.0, "num_tokens": 2121979.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 7116 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.25, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 20.25, "completions/mean_terminated_length": 20.25, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 131.7962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.13039585947990417, "kl": 0.03150393161922693, "learning_rate": 1.6022222222222223e-06, "loss": 0.0015, "num_tokens": 2122312.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7117 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.0, "completions/clipped_ratio": 0.0, "completions/max_length": 58.0, "completions/max_terminated_length": 58.0, "completions/mean_length": 25.0, "completions/mean_terminated_length": 25.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 131.8148148148148, "frac_reward_zero_std": 0.0, "grad_norm": 3.2580556869506836, "kl": 0.039731867611408234, "learning_rate": 1.6016666666666669e-06, "loss": 0.3094, "num_tokens": 2122636.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 7118 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 131.83333333333334, "frac_reward_zero_std": 1.0, "grad_norm": 0.01591704599559307, "kl": 0.021352888084948063, "learning_rate": 1.6011111111111114e-06, "loss": 0.0011, "num_tokens": 2122928.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7119 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 35.25, "completions/clipped_ratio": 0.0, "completions/max_length": 48.0, "completions/max_terminated_length": 48.0, "completions/mean_length": 35.25, "completions/mean_terminated_length": 35.25, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 131.85185185185185, "frac_reward_zero_std": 0.0, "grad_norm": 1.1165554523468018, "kl": 0.07283507660031319, "learning_rate": 1.6005555555555558e-06, "loss": -0.2145, "num_tokens": 2123321.0, "reward": 7.875, "reward_std": 0.25, "rewards/reward_combined/mean": 7.875, "rewards/reward_combined/std": 0.25, "step": 7120 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 131.87037037037038, "frac_reward_zero_std": 1.0, "grad_norm": 0.034905437380075455, "kl": 0.0027291610604152083, "learning_rate": 1.6000000000000001e-06, "loss": 0.0001, "num_tokens": 2123581.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7121 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.75, "completions/clipped_ratio": 0.0, "completions/max_length": 25.0, "completions/max_terminated_length": 25.0, "completions/mean_length": 23.75, "completions/mean_terminated_length": 23.75, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 131.88888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.07433158159255981, "kl": 0.02408934012055397, "learning_rate": 1.5994444444444445e-06, "loss": 0.0012, "num_tokens": 2123924.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7122 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 26.75, "completions/clipped_ratio": 0.0, "completions/max_length": 32.0, "completions/max_terminated_length": 32.0, "completions/mean_length": 26.75, "completions/mean_terminated_length": 26.75, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 131.90740740740742, "frac_reward_zero_std": 0.0, "grad_norm": 5.7796125411987305, "kl": 0.10430440120398998, "learning_rate": 1.598888888888889e-06, "loss": -0.0375, "num_tokens": 2124259.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 7123 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 131.92592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.0046536605805158615, "kl": 0.010251656174659729, "learning_rate": 1.5983333333333334e-06, "loss": 0.0005, "num_tokens": 2124495.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 7124 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 131.94444444444446, "frac_reward_zero_std": 1.0, "grad_norm": 0.003372194478288293, "kl": 0.282270610332489, "learning_rate": 1.5977777777777778e-06, "loss": 0.0141, "num_tokens": 2124783.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7125 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 131.96296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.01675453595817089, "kl": 0.0005927532911300659, "learning_rate": 1.5972222222222221e-06, "loss": 0.0, "num_tokens": 2124995.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 7126 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 131.9814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.009419669397175312, "kl": 0.00753821711987257, "learning_rate": 1.596666666666667e-06, "loss": 0.0004, "num_tokens": 2125307.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 7127 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 36.0, "completions/clipped_ratio": 0.0, "completions/max_length": 38.0, "completions/max_terminated_length": 38.0, "completions/mean_length": 36.0, "completions/mean_terminated_length": 36.0, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 132.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.021822916343808174, "kl": 0.06750452145934105, "learning_rate": 1.5961111111111113e-06, "loss": 0.0034, "num_tokens": 2125679.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7128 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.25, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 14.25, "completions/mean_terminated_length": 14.25, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 132.0185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.10351109504699707, "kl": 0.027773371897637844, "learning_rate": 1.5955555555555558e-06, "loss": 0.0017, "num_tokens": 2125956.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7129 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 132.03703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.021825645118951797, "kl": 0.18637976795434952, "learning_rate": 1.5950000000000002e-06, "loss": 0.0093, "num_tokens": 2126240.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7130 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 132.05555555555554, "frac_reward_zero_std": 1.0, "grad_norm": 0.023564694449305534, "kl": 0.004358649137429893, "learning_rate": 1.5944444444444445e-06, "loss": 0.0002, "num_tokens": 2126528.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7131 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 16.5, "completions/mean_terminated_length": 16.5, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 132.07407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.3271913528442383, "kl": 0.059478598181158304, "learning_rate": 1.593888888888889e-06, "loss": 0.003, "num_tokens": 2126834.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7132 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.25, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 10.25, "completions/mean_terminated_length": 10.25, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 132.09259259259258, "frac_reward_zero_std": 1.0, "grad_norm": 0.009647190570831299, "kl": 0.00022002458717906848, "learning_rate": 1.5933333333333335e-06, "loss": 0.0, "num_tokens": 2127091.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 7133 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.25, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 9.25, "completions/mean_terminated_length": 9.25, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 132.11111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.09200785309076309, "kl": 0.01710047945380211, "learning_rate": 1.5927777777777778e-06, "loss": 0.0009, "num_tokens": 2127352.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7134 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.5, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 6.5, "completions/mean_terminated_length": 6.5, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 132.12962962962962, "frac_reward_zero_std": 1.0, "grad_norm": 0.07233866304159164, "kl": 0.024635879322886467, "learning_rate": 1.5922222222222222e-06, "loss": 0.0012, "num_tokens": 2127578.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 7135 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 132.14814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.010494797490537167, "kl": 0.0021570250391960144, "learning_rate": 1.591666666666667e-06, "loss": 0.0001, "num_tokens": 2127822.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 7136 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 33.0, "completions/clipped_ratio": 0.0, "completions/max_length": 35.0, "completions/max_terminated_length": 35.0, "completions/mean_length": 33.0, "completions/mean_terminated_length": 33.0, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 132.16666666666666, "frac_reward_zero_std": 0.0, "grad_norm": 1.2869211435317993, "kl": 0.40816404670476913, "learning_rate": 1.5911111111111113e-06, "loss": -0.0272, "num_tokens": 2128182.0, "reward": 7.875, "reward_std": 0.25, "rewards/reward_combined/mean": 7.875, "rewards/reward_combined/std": 0.25, "step": 7137 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 132.1851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.007684871554374695, "kl": 0.0013354122638702393, "learning_rate": 1.5905555555555557e-06, "loss": 0.0001, "num_tokens": 2128394.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 7138 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.0, "completions/clipped_ratio": 0.0, "completions/max_length": 27.0, "completions/max_terminated_length": 27.0, "completions/mean_length": 23.0, "completions/mean_terminated_length": 23.0, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 132.2037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.15297986567020416, "kl": 0.022742931731045246, "learning_rate": 1.5900000000000002e-06, "loss": 0.0012, "num_tokens": 2128734.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 7139 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 31.5, "completions/clipped_ratio": 0.0, "completions/max_length": 40.0, "completions/max_terminated_length": 40.0, "completions/mean_length": 31.5, "completions/mean_terminated_length": 31.5, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 132.22222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.08242428302764893, "kl": 0.0884629376232624, "learning_rate": 1.5894444444444446e-06, "loss": 0.0045, "num_tokens": 2129096.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7140 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.0, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 21.0, "completions/mean_terminated_length": 21.0, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 132.24074074074073, "frac_reward_zero_std": 1.0, "grad_norm": 0.033901914954185486, "kl": 0.016642311587929726, "learning_rate": 1.588888888888889e-06, "loss": 0.0008, "num_tokens": 2129432.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7141 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.5, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 17.5, "completions/mean_terminated_length": 17.5, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 132.25925925925927, "frac_reward_zero_std": 1.0, "grad_norm": 0.04525871202349663, "kl": 0.013560467399656773, "learning_rate": 1.5883333333333333e-06, "loss": 0.0007, "num_tokens": 2129730.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7142 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 132.27777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.012694064527750015, "kl": 0.005606785416603088, "learning_rate": 1.5877777777777779e-06, "loss": 0.0002, "num_tokens": 2129938.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 7143 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.0, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 19.0, "completions/mean_terminated_length": 19.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 132.2962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.1506829857826233, "kl": 0.17901617288589478, "learning_rate": 1.5872222222222222e-06, "loss": 0.009, "num_tokens": 2130254.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7144 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 35.75, "completions/clipped_ratio": 0.0, "completions/max_length": 49.0, "completions/max_terminated_length": 49.0, "completions/mean_length": 35.75, "completions/mean_terminated_length": 35.75, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 132.3148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.09926097095012665, "kl": 0.0339282164350152, "learning_rate": 1.586666666666667e-06, "loss": 0.0017, "num_tokens": 2130617.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7145 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 132.33333333333334, "frac_reward_zero_std": 1.0, "grad_norm": 0.05642816796898842, "kl": 0.0037209689617156982, "learning_rate": 1.5861111111111114e-06, "loss": 0.0002, "num_tokens": 2130869.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7146 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.75, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.75, "completions/mean_terminated_length": 15.75, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 132.35185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.019299596548080444, "kl": 0.0017907322035171092, "learning_rate": 1.5855555555555557e-06, "loss": 0.0001, "num_tokens": 2131192.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7147 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 43.0, "completions/clipped_ratio": 0.0, "completions/max_length": 45.0, "completions/max_terminated_length": 45.0, "completions/mean_length": 43.0, "completions/mean_terminated_length": 43.0, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 132.37037037037038, "frac_reward_zero_std": 1.0, "grad_norm": 0.016435429453849792, "kl": 0.07526886090636253, "learning_rate": 1.585e-06, "loss": 0.0037, "num_tokens": 2131632.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7148 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 132.38888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.14995141327381134, "kl": 0.016953308135271072, "learning_rate": 1.5844444444444446e-06, "loss": 0.0008, "num_tokens": 2131892.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 7149 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.75, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 4.75, "completions/mean_terminated_length": 4.75, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 132.40740740740742, "frac_reward_zero_std": 1.0, "grad_norm": 0.04010741412639618, "kl": 0.0018369376193732023, "learning_rate": 1.583888888888889e-06, "loss": 0.0001, "num_tokens": 2132111.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 7150 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.75, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 19.75, "completions/mean_terminated_length": 19.75, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 132.42592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.045413073152303696, "kl": 0.31743405759334564, "learning_rate": 1.5833333333333333e-06, "loss": 0.0157, "num_tokens": 2132438.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7151 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 132.44444444444446, "frac_reward_zero_std": 1.0, "grad_norm": 0.015144754201173782, "kl": 0.2519923895597458, "learning_rate": 1.5827777777777777e-06, "loss": 0.0126, "num_tokens": 2132736.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7152 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 7.25, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 7.25, "completions/mean_terminated_length": 7.25, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 132.46296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.12200044840574265, "kl": 0.011101612821221352, "learning_rate": 1.5822222222222223e-06, "loss": 0.0009, "num_tokens": 2132961.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 7153 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.75, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 19.75, "completions/mean_terminated_length": 19.75, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 132.4814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.1892084777355194, "kl": 0.028345356695353985, "learning_rate": 1.5816666666666668e-06, "loss": 0.0016, "num_tokens": 2133284.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7154 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.25, "completions/clipped_ratio": 0.0, "completions/max_length": 39.0, "completions/max_terminated_length": 39.0, "completions/mean_length": 32.25, "completions/mean_terminated_length": 32.25, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 132.5, "frac_reward_zero_std": 0.0, "grad_norm": 4.189661026000977, "kl": 0.34900492429733276, "learning_rate": 1.5811111111111114e-06, "loss": 0.1196, "num_tokens": 2133637.0, "reward": 5.75, "reward_std": 4.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 4.5, "step": 7155 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 132.5185185185185, "frac_reward_zero_std": 0.0, "grad_norm": 4.639244079589844, "kl": 0.12331192195415497, "learning_rate": 1.5805555555555558e-06, "loss": 0.0361, "num_tokens": 2133909.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 7156 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 36.25, "completions/clipped_ratio": 0.0, "completions/max_length": 43.0, "completions/max_terminated_length": 43.0, "completions/mean_length": 36.25, "completions/mean_terminated_length": 36.25, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 132.53703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.11795959621667862, "kl": 0.04867975041270256, "learning_rate": 1.5800000000000001e-06, "loss": 0.0025, "num_tokens": 2134278.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7157 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 132.55555555555554, "frac_reward_zero_std": 1.0, "grad_norm": 0.009660028852522373, "kl": 0.0035163608845323324, "learning_rate": 1.5794444444444445e-06, "loss": 0.0002, "num_tokens": 2134548.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7158 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 132.57407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.0008618413121439517, "kl": 0.0017343653016723692, "learning_rate": 1.578888888888889e-06, "loss": 0.0001, "num_tokens": 2134828.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7159 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.75, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.75, "completions/mean_terminated_length": 6.75, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 132.59259259259258, "frac_reward_zero_std": 1.0, "grad_norm": 0.02396208420395851, "kl": 0.0013054652226855978, "learning_rate": 1.5783333333333334e-06, "loss": 0.0001, "num_tokens": 2135063.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 7160 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 132.61111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.04878072068095207, "kl": 0.012942212517373264, "learning_rate": 1.5777777777777778e-06, "loss": 0.0007, "num_tokens": 2135349.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7161 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.0, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 19.0, "completions/mean_terminated_length": 19.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 132.62962962962962, "frac_reward_zero_std": 1.0, "grad_norm": 0.06698031723499298, "kl": 0.020975886844098568, "learning_rate": 1.5772222222222225e-06, "loss": 0.0011, "num_tokens": 2135645.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7162 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.75, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 17.75, "completions/mean_terminated_length": 17.75, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 132.64814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.06225673854351044, "kl": 0.008317965548485518, "learning_rate": 1.5766666666666669e-06, "loss": 0.0004, "num_tokens": 2135956.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7163 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 132.66666666666666, "frac_reward_zero_std": 1.0, "grad_norm": 0.00913753267377615, "kl": 0.000998860050458461, "learning_rate": 1.5761111111111112e-06, "loss": 0.0, "num_tokens": 2136252.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7164 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 132.6851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.004525170661509037, "kl": 0.010284863412380219, "learning_rate": 1.5755555555555558e-06, "loss": 0.0005, "num_tokens": 2136488.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 7165 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 132.7037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.019502563402056694, "kl": 0.0071472483687102795, "learning_rate": 1.5750000000000002e-06, "loss": 0.0004, "num_tokens": 2136760.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7166 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.25, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 12.25, "completions/mean_terminated_length": 12.25, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 132.72222222222223, "frac_reward_zero_std": 0.0, "grad_norm": 4.275188446044922, "kl": 0.07081369159277529, "learning_rate": 1.5744444444444445e-06, "loss": -0.0473, "num_tokens": 2137033.0, "reward": 7.625, "reward_std": 0.25, "rewards/reward_combined/mean": 7.625, "rewards/reward_combined/std": 0.25, "step": 7167 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 132.74074074074073, "frac_reward_zero_std": 1.0, "grad_norm": 0.39246514439582825, "kl": 0.034386447980068624, "learning_rate": 1.5738888888888889e-06, "loss": 0.0021, "num_tokens": 2137305.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7168 }, { "clip_ratio/high_max": 0.008064515888690948, "clip_ratio/high_mean": 0.008064515888690948, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.008064515888690948, "completion_length": 25.75, "completions/clipped_ratio": 0.0, "completions/max_length": 39.0, "completions/max_terminated_length": 39.0, "completions/mean_length": 25.75, "completions/mean_terminated_length": 25.75, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 132.75925925925927, "frac_reward_zero_std": 0.0, "grad_norm": 3.127068042755127, "kl": 0.07933797687292099, "learning_rate": 1.5733333333333334e-06, "loss": 0.1291, "num_tokens": 2137636.0, "reward": 5.125, "reward_std": 3.4731109142303467, "rewards/reward_combined/mean": 5.125, "rewards/reward_combined/std": 3.4731109142303467, "step": 7169 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 132.77777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.07368350774049759, "kl": 0.28700336813926697, "learning_rate": 1.5727777777777778e-06, "loss": 0.0144, "num_tokens": 2137924.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7170 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.25, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 15.25, "completions/mean_terminated_length": 15.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 132.7962962962963, "frac_reward_zero_std": 0.0, "grad_norm": 4.871882915496826, "kl": 0.12197799421846867, "learning_rate": 1.5722222222222226e-06, "loss": 0.2164, "num_tokens": 2138209.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 7171 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 132.8148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.02604195661842823, "kl": 0.005928986705839634, "learning_rate": 1.571666666666667e-06, "loss": 0.0003, "num_tokens": 2138493.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7172 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 132.83333333333334, "frac_reward_zero_std": 1.0, "grad_norm": 0.00015223451191559434, "kl": 1.782923936843872e-05, "learning_rate": 1.5711111111111113e-06, "loss": 0.0, "num_tokens": 2138713.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 7173 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 132.85185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.005958667024970055, "kl": 0.0009195625898428261, "learning_rate": 1.5705555555555556e-06, "loss": 0.0, "num_tokens": 2138973.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 7174 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 33.75, "completions/clipped_ratio": 0.0, "completions/max_length": 46.0, "completions/max_terminated_length": 46.0, "completions/mean_length": 33.75, "completions/mean_terminated_length": 33.75, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 132.87037037037038, "frac_reward_zero_std": 0.0, "grad_norm": 1.4503672122955322, "kl": 0.0719026941806078, "learning_rate": 1.5700000000000002e-06, "loss": -0.2037, "num_tokens": 2139360.0, "reward": 7.875, "reward_std": 0.25, "rewards/reward_combined/mean": 7.875, "rewards/reward_combined/std": 0.25, "step": 7175 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.0, "completions/clipped_ratio": 0.0, "completions/max_length": 25.0, "completions/max_terminated_length": 25.0, "completions/mean_length": 21.0, "completions/mean_terminated_length": 21.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 132.88888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.03716137260198593, "kl": 0.07004294544458389, "learning_rate": 1.5694444444444446e-06, "loss": 0.0035, "num_tokens": 2139664.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 7176 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 132.90740740740742, "frac_reward_zero_std": 1.0, "grad_norm": 0.023132674396038055, "kl": 0.002655446412973106, "learning_rate": 1.568888888888889e-06, "loss": 0.0001, "num_tokens": 2139948.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7177 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 132.92592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.034224361181259155, "kl": 0.004553021863102913, "learning_rate": 1.5683333333333333e-06, "loss": 0.0002, "num_tokens": 2140260.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 7178 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.75, "completions/clipped_ratio": 0.0, "completions/max_length": 51.0, "completions/max_terminated_length": 51.0, "completions/mean_length": 32.75, "completions/mean_terminated_length": 32.75, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 132.94444444444446, "frac_reward_zero_std": 0.0, "grad_norm": 2.5939126014709473, "kl": 0.12213754653930664, "learning_rate": 1.5677777777777778e-06, "loss": 0.1029, "num_tokens": 2140607.0, "reward": 4.0, "reward_std": 4.690415859222412, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 4.690415859222412, "step": 7179 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 132.96296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.08168300241231918, "kl": 0.014857674017548561, "learning_rate": 1.5672222222222224e-06, "loss": 0.0007, "num_tokens": 2140895.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7180 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 45.0, "completions/clipped_ratio": 0.0, "completions/max_length": 51.0, "completions/max_terminated_length": 51.0, "completions/mean_length": 45.0, "completions/mean_terminated_length": 45.0, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 132.9814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.02062055841088295, "kl": 0.04684109427034855, "learning_rate": 1.566666666666667e-06, "loss": 0.0023, "num_tokens": 2141355.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 7181 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.0, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 22.0, "completions/mean_terminated_length": 22.0, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 133.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.025504879653453827, "kl": 0.012581905350089073, "learning_rate": 1.5661111111111113e-06, "loss": 0.0006, "num_tokens": 2141671.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7182 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.25, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 14.25, "completions/mean_terminated_length": 14.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 133.0185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.1026797741651535, "kl": 0.07569685578346252, "learning_rate": 1.5655555555555557e-06, "loss": 0.0041, "num_tokens": 2141952.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7183 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.25, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 14.25, "completions/mean_terminated_length": 14.25, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 133.03703703703704, "frac_reward_zero_std": 0.0, "grad_norm": 4.02355432510376, "kl": 0.15920240490231663, "learning_rate": 1.565e-06, "loss": 0.089, "num_tokens": 2142241.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 7184 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 133.05555555555554, "frac_reward_zero_std": 1.0, "grad_norm": 0.019761310890316963, "kl": 0.1871432140469551, "learning_rate": 1.5644444444444446e-06, "loss": 0.0094, "num_tokens": 2142525.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7185 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 133.07407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.05055435001850128, "kl": 0.01205674558877945, "learning_rate": 1.563888888888889e-06, "loss": 0.0006, "num_tokens": 2142795.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7186 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 133.09259259259258, "frac_reward_zero_std": 1.0, "grad_norm": 0.005658216308802366, "kl": 0.00047732144594192505, "learning_rate": 1.5633333333333333e-06, "loss": 0.0, "num_tokens": 2143007.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 7187 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.5, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.5, "completions/mean_terminated_length": 6.5, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 133.11111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.02762504480779171, "kl": 0.0015152934356592596, "learning_rate": 1.5627777777777777e-06, "loss": 0.0001, "num_tokens": 2143241.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 7188 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.25, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 12.25, "completions/mean_terminated_length": 12.25, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 133.12962962962962, "frac_reward_zero_std": 0.0, "grad_norm": 6.380984306335449, "kl": 0.14068532455712557, "learning_rate": 1.5622222222222225e-06, "loss": 0.0562, "num_tokens": 2143514.0, "reward": 6.5, "reward_std": 2.0, "rewards/reward_combined/mean": 6.5, "rewards/reward_combined/std": 2.0, "step": 7189 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 37.5, "completions/clipped_ratio": 0.0, "completions/max_length": 45.0, "completions/max_terminated_length": 45.0, "completions/mean_length": 37.5, "completions/mean_terminated_length": 37.5, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 133.14814814814815, "frac_reward_zero_std": 0.0, "grad_norm": 2.7497425079345703, "kl": 0.06195956841111183, "learning_rate": 1.5616666666666668e-06, "loss": 0.0584, "num_tokens": 2143892.0, "reward": 6.125, "reward_std": 3.75, "rewards/reward_combined/mean": 6.125, "rewards/reward_combined/std": 3.75, "step": 7190 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.0, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 23.0, "completions/mean_terminated_length": 23.0, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 133.16666666666666, "frac_reward_zero_std": 0.0, "grad_norm": 1.571511149406433, "kl": 0.01749340770766139, "learning_rate": 1.5611111111111114e-06, "loss": -0.0233, "num_tokens": 2144228.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 7191 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 133.1851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.0037463619373738766, "kl": 0.28213731944561005, "learning_rate": 1.5605555555555557e-06, "loss": 0.0141, "num_tokens": 2144516.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7192 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 133.2037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.1321907788515091, "kl": 0.07463345187716186, "learning_rate": 1.56e-06, "loss": 0.0037, "num_tokens": 2144786.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7193 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.75, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.75, "completions/mean_terminated_length": 14.75, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 133.22222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.029211105778813362, "kl": 0.022858545184135437, "learning_rate": 1.5594444444444445e-06, "loss": 0.0012, "num_tokens": 2145077.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7194 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.25, "completions/clipped_ratio": 0.0, "completions/max_length": 31.0, "completions/max_terminated_length": 31.0, "completions/mean_length": 21.25, "completions/mean_terminated_length": 21.25, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 133.24074074074073, "frac_reward_zero_std": 0.0, "grad_norm": 3.4455835819244385, "kl": 0.07830433547496796, "learning_rate": 1.558888888888889e-06, "loss": 0.1382, "num_tokens": 2145382.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 7195 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.25, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 9.25, "completions/mean_terminated_length": 9.25, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 133.25925925925927, "frac_reward_zero_std": 1.0, "grad_norm": 0.0193022508174181, "kl": 0.0024699093773961067, "learning_rate": 1.5583333333333334e-06, "loss": 0.0001, "num_tokens": 2145631.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 7196 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 133.27777777777777, "frac_reward_zero_std": 0.0, "grad_norm": 2.57177996635437, "kl": 0.9576224047923461, "learning_rate": 1.5577777777777777e-06, "loss": 0.0703, "num_tokens": 2145931.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 7197 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.5, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 17.5, "completions/mean_terminated_length": 17.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 133.2962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.03149450942873955, "kl": 0.007283747661858797, "learning_rate": 1.5572222222222225e-06, "loss": 0.0004, "num_tokens": 2146229.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7198 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 133.3148148148148, "frac_reward_zero_std": 0.0, "grad_norm": 6.267495632171631, "kl": 0.009472236735746264, "learning_rate": 1.5566666666666669e-06, "loss": -0.002, "num_tokens": 2146505.0, "reward": 6.75, "reward_std": 2.5, "rewards/reward_combined/mean": 6.75, "rewards/reward_combined/std": 2.5, "step": 7199 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 86.25, "completions/clipped_ratio": 0.25, "completions/max_length": 256.0, "completions/max_terminated_length": 32.0, "completions/mean_length": 86.25, "completions/mean_terminated_length": 29.666667938232422, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 133.33333333333334, "frac_reward_zero_std": 0.0, "grad_norm": 1.5303096771240234, "kl": 0.12602905184030533, "learning_rate": 1.5561111111111112e-06, "loss": 0.3952, "num_tokens": 2147066.0, "reward": 5.625, "reward_std": 4.75, "rewards/reward_combined/mean": 5.625, "rewards/reward_combined/std": 4.75, "step": 7200 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 133.35185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.02512529492378235, "kl": 0.00975404866039753, "learning_rate": 1.5555555555555558e-06, "loss": 0.0005, "num_tokens": 2147378.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 7201 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 17.0, "completions/mean_terminated_length": 17.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 133.37037037037038, "frac_reward_zero_std": 1.0, "grad_norm": 0.014551995322108269, "kl": 0.13369908928871155, "learning_rate": 1.5550000000000001e-06, "loss": 0.0067, "num_tokens": 2147686.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7202 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.25, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.25, "completions/mean_terminated_length": 11.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 133.38888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.051657840609550476, "kl": 0.009373006876558065, "learning_rate": 1.5544444444444445e-06, "loss": 0.0005, "num_tokens": 2147959.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7203 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 133.40740740740742, "frac_reward_zero_std": 1.0, "grad_norm": 0.004947669338434935, "kl": 0.010205760598182678, "learning_rate": 1.5538888888888889e-06, "loss": 0.0005, "num_tokens": 2148195.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 7204 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 133.42592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.009209487587213516, "kl": 0.00018517375428928062, "learning_rate": 1.5533333333333334e-06, "loss": 0.0, "num_tokens": 2148451.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 7205 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 133.44444444444446, "frac_reward_zero_std": 1.0, "grad_norm": 0.018287107348442078, "kl": 0.0035643242299556732, "learning_rate": 1.5527777777777778e-06, "loss": 0.0002, "num_tokens": 2148711.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 7206 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 44.0, "completions/clipped_ratio": 0.0, "completions/max_length": 58.0, "completions/max_terminated_length": 58.0, "completions/mean_length": 44.0, "completions/mean_terminated_length": 44.0, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 133.46296296296296, "frac_reward_zero_std": 0.0, "grad_norm": 1.6277499198913574, "kl": 0.24095821008086205, "learning_rate": 1.5522222222222226e-06, "loss": -0.0097, "num_tokens": 2149111.0, "reward": 4.0, "reward_std": 4.041451930999756, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 4.041451930999756, "step": 7207 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 39.75, "completions/clipped_ratio": 0.0, "completions/max_length": 52.0, "completions/max_terminated_length": 52.0, "completions/mean_length": 39.75, "completions/mean_terminated_length": 39.75, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 133.4814814814815, "frac_reward_zero_std": 0.0, "grad_norm": 2.858672857284546, "kl": 0.12933551892638206, "learning_rate": 1.551666666666667e-06, "loss": -0.0656, "num_tokens": 2149522.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 7208 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.5, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 20.5, "completions/mean_terminated_length": 20.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 133.5, "frac_reward_zero_std": 1.0, "grad_norm": 0.060370463877916336, "kl": 0.0597477313131094, "learning_rate": 1.5511111111111113e-06, "loss": 0.0032, "num_tokens": 2149852.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7209 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.75, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 19.75, "completions/mean_terminated_length": 19.75, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 133.5185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.10163125395774841, "kl": 0.020229718182235956, "learning_rate": 1.5505555555555556e-06, "loss": 0.001, "num_tokens": 2150151.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7210 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 26.0, "completions/clipped_ratio": 0.0, "completions/max_length": 33.0, "completions/max_terminated_length": 33.0, "completions/mean_length": 26.0, "completions/mean_terminated_length": 26.0, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 133.53703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.0767996683716774, "kl": 0.014332738239318132, "learning_rate": 1.5500000000000002e-06, "loss": 0.0007, "num_tokens": 2150503.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 7211 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 133.55555555555554, "frac_reward_zero_std": 1.0, "grad_norm": 0.016321726143360138, "kl": 0.0012876689434051514, "learning_rate": 1.5494444444444446e-06, "loss": 0.0001, "num_tokens": 2150715.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 7212 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 133.57407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.0008146890322677791, "kl": 0.0017434380133636296, "learning_rate": 1.548888888888889e-06, "loss": 0.0001, "num_tokens": 2150995.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7213 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 133.59259259259258, "frac_reward_zero_std": 0.0, "grad_norm": 4.034658908843994, "kl": 0.025983136147260666, "learning_rate": 1.5483333333333333e-06, "loss": -0.065, "num_tokens": 2151287.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 7214 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 133.61111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.004796174354851246, "kl": 0.22626761347055435, "learning_rate": 1.5477777777777778e-06, "loss": 0.0113, "num_tokens": 2151589.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7215 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 45.0, "completions/clipped_ratio": 0.0, "completions/max_length": 51.0, "completions/max_terminated_length": 51.0, "completions/mean_length": 45.0, "completions/mean_terminated_length": 45.0, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 133.62962962962962, "frac_reward_zero_std": 1.0, "grad_norm": 0.01525871455669403, "kl": 0.04600161127746105, "learning_rate": 1.5472222222222224e-06, "loss": 0.0023, "num_tokens": 2152049.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 7216 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 10.5, "completions/mean_terminated_length": 10.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 133.64814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.04449987784028053, "kl": 0.00501628452911973, "learning_rate": 1.546666666666667e-06, "loss": 0.0002, "num_tokens": 2152311.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7217 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 124.5, "completions/clipped_ratio": 0.25, "completions/max_length": 256.0, "completions/max_terminated_length": 182.0, "completions/mean_length": 124.5, "completions/mean_terminated_length": 80.66667175292969, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 133.66666666666666, "frac_reward_zero_std": 0.0, "grad_norm": 1.297121286392212, "kl": 0.053700391203165054, "learning_rate": 1.5461111111111113e-06, "loss": 0.5948, "num_tokens": 2153045.0, "reward": 3.424999952316284, "reward_std": 5.249364852905273, "rewards/reward_combined/mean": 3.424999952316284, "rewards/reward_combined/std": 5.249364852905273, "step": 7218 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 17.0, "completions/mean_terminated_length": 17.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 133.6851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.02140449360013008, "kl": 0.0038912349846214056, "learning_rate": 1.5455555555555557e-06, "loss": 0.0002, "num_tokens": 2153353.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7219 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 31.5, "completions/clipped_ratio": 0.0, "completions/max_length": 51.0, "completions/max_terminated_length": 51.0, "completions/mean_length": 31.5, "completions/mean_terminated_length": 31.5, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 133.7037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.18100784718990326, "kl": 0.07226749882102013, "learning_rate": 1.545e-06, "loss": 0.0029, "num_tokens": 2153699.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7220 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.25, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 8.25, "completions/mean_terminated_length": 8.25, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 133.72222222222223, "frac_reward_zero_std": 0.0, "grad_norm": 5.726509094238281, "kl": 2.046635246835649, "learning_rate": 1.5444444444444446e-06, "loss": -0.0102, "num_tokens": 2153956.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 7221 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.75, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.75, "completions/mean_terminated_length": 14.75, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 133.74074074074073, "frac_reward_zero_std": 1.0, "grad_norm": 0.0866616889834404, "kl": 0.005959659232757986, "learning_rate": 1.543888888888889e-06, "loss": 0.0003, "num_tokens": 2154247.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7222 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 133.75925925925927, "frac_reward_zero_std": 1.0, "grad_norm": 0.017462989315390587, "kl": 0.0014639954315498471, "learning_rate": 1.5433333333333333e-06, "loss": 0.0001, "num_tokens": 2154513.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7223 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.25, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.25, "completions/mean_terminated_length": 15.25, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 133.77777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.006658535450696945, "kl": 0.001439347630366683, "learning_rate": 1.542777777777778e-06, "loss": 0.0001, "num_tokens": 2154834.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7224 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 133.7962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.0001587972801644355, "kl": 2.0422041416168213e-05, "learning_rate": 1.5422222222222224e-06, "loss": 0.0, "num_tokens": 2155054.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 7225 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.0, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 22.0, "completions/mean_terminated_length": 22.0, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 133.8148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.0972246304154396, "kl": 0.019220730289816856, "learning_rate": 1.5416666666666668e-06, "loss": 0.001, "num_tokens": 2155370.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7226 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.75, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 20.75, "completions/mean_terminated_length": 20.75, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 133.83333333333334, "frac_reward_zero_std": 1.0, "grad_norm": 0.064311183989048, "kl": 0.018906302750110626, "learning_rate": 1.5411111111111114e-06, "loss": 0.0009, "num_tokens": 2155705.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7227 }, { "clip_ratio/high_max": 0.013888888992369175, "clip_ratio/high_mean": 0.013888888992369175, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.013888888992369175, "completion_length": 21.25, "completions/clipped_ratio": 0.0, "completions/max_length": 30.0, "completions/max_terminated_length": 30.0, "completions/mean_length": 21.25, "completions/mean_terminated_length": 21.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 133.85185185185185, "frac_reward_zero_std": 0.0, "grad_norm": 2.4054067134857178, "kl": 0.10059146955609322, "learning_rate": 1.5405555555555557e-06, "loss": -0.1078, "num_tokens": 2156018.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 7228 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 89.0, "completions/clipped_ratio": 0.25, "completions/max_length": 256.0, "completions/max_terminated_length": 35.0, "completions/mean_length": 89.0, "completions/mean_terminated_length": 33.333335876464844, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 133.87037037037038, "frac_reward_zero_std": 0.0, "grad_norm": 1.5345656871795654, "kl": 0.2699839845299721, "learning_rate": 1.54e-06, "loss": 0.4267, "num_tokens": 2156598.0, "reward": 4.675000190734863, "reward_std": 4.221670150756836, "rewards/reward_combined/mean": 4.675000190734863, "rewards/reward_combined/std": 4.221670627593994, "step": 7229 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 5.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 5.0, "completions/mean_terminated_length": 5.0, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 133.88888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.004068436101078987, "kl": 0.00046710371680092067, "learning_rate": 1.5394444444444444e-06, "loss": 0.0, "num_tokens": 2156818.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 7230 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.75, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 4.75, "completions/mean_terminated_length": 4.75, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 133.90740740740742, "frac_reward_zero_std": 1.0, "grad_norm": 0.029137171804904938, "kl": 0.027690690010786057, "learning_rate": 1.538888888888889e-06, "loss": 0.0014, "num_tokens": 2157037.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 7231 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.5, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 8.5, "completions/mean_terminated_length": 8.5, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 133.92592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.057238731533288956, "kl": 0.004290816141292453, "learning_rate": 1.5383333333333334e-06, "loss": 0.0002, "num_tokens": 2157291.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 7232 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 33.0, "completions/clipped_ratio": 0.0, "completions/max_length": 33.0, "completions/max_terminated_length": 33.0, "completions/mean_length": 33.0, "completions/mean_terminated_length": 33.0, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 133.94444444444446, "frac_reward_zero_std": 1.0, "grad_norm": 0.045201536267995834, "kl": 0.09438633546233177, "learning_rate": 1.5377777777777781e-06, "loss": 0.0047, "num_tokens": 2157691.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7233 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.5, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 3.5, "completions/mean_terminated_length": 3.5, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 133.96296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.03442693129181862, "kl": 0.0032054558396339417, "learning_rate": 1.5372222222222225e-06, "loss": 0.0001, "num_tokens": 2157901.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 7234 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 18.0, "completions/mean_terminated_length": 18.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 133.9814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.00833001360297203, "kl": 0.0020173738594166934, "learning_rate": 1.5366666666666668e-06, "loss": 0.0001, "num_tokens": 2158213.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7235 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.75, "completions/clipped_ratio": 0.0, "completions/max_length": 29.0, "completions/max_terminated_length": 29.0, "completions/mean_length": 19.75, "completions/mean_terminated_length": 19.75, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 134.0, "frac_reward_zero_std": 1.0, "grad_norm": 1.4568589925765991, "kl": 0.1367518436163664, "learning_rate": 1.5361111111111112e-06, "loss": 0.0073, "num_tokens": 2158512.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7236 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 134.0185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.04480183497071266, "kl": 0.005468477262184024, "learning_rate": 1.5355555555555558e-06, "loss": 0.0003, "num_tokens": 2158784.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7237 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 134.03703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.09231133759021759, "kl": 0.01750053931027651, "learning_rate": 1.5350000000000001e-06, "loss": 0.0008, "num_tokens": 2159060.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7238 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.5, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 20.5, "completions/mean_terminated_length": 20.5, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 134.05555555555554, "frac_reward_zero_std": 1.0, "grad_norm": 0.03440945968031883, "kl": 0.015506135765463114, "learning_rate": 1.5344444444444445e-06, "loss": 0.0008, "num_tokens": 2159394.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7239 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 134.07407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.008919189684092999, "kl": 0.000833466649055481, "learning_rate": 1.5338888888888888e-06, "loss": 0.0, "num_tokens": 2159666.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7240 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 134.09259259259258, "frac_reward_zero_std": 1.0, "grad_norm": 0.037250444293022156, "kl": 0.0036595771089196205, "learning_rate": 1.5333333333333334e-06, "loss": 0.0002, "num_tokens": 2159952.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7241 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 44.5, "completions/clipped_ratio": 0.0, "completions/max_length": 82.0, "completions/max_terminated_length": 82.0, "completions/mean_length": 44.5, "completions/mean_terminated_length": 44.5, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 134.11111111111111, "frac_reward_zero_std": 0.0, "grad_norm": 1.9197356700897217, "kl": 0.025629126466810703, "learning_rate": 1.532777777777778e-06, "loss": 0.2154, "num_tokens": 2160354.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 7242 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.5, "completions/clipped_ratio": 0.0, "completions/max_length": 50.0, "completions/max_terminated_length": 50.0, "completions/mean_length": 32.5, "completions/mean_terminated_length": 32.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 134.12962962962962, "frac_reward_zero_std": 1.0, "grad_norm": 0.13581493496894836, "kl": 0.03992139920592308, "learning_rate": 1.5322222222222225e-06, "loss": 0.0019, "num_tokens": 2160704.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7243 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 134.14814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.0027889732737094164, "kl": 0.0013518094783648849, "learning_rate": 1.531666666666667e-06, "loss": 0.0001, "num_tokens": 2161024.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7244 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.25, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.25, "completions/mean_terminated_length": 14.25, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 134.16666666666666, "frac_reward_zero_std": 1.0, "grad_norm": 0.022273847833275795, "kl": 0.0026639073621481657, "learning_rate": 1.5311111111111113e-06, "loss": 0.0001, "num_tokens": 2161305.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7245 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 134.1851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.016908859834074974, "kl": 0.003824492683634162, "learning_rate": 1.5305555555555556e-06, "loss": 0.0002, "num_tokens": 2161609.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7246 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.25, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 8.25, "completions/mean_terminated_length": 8.25, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 134.2037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.16338777542114258, "kl": 0.03419748321175575, "learning_rate": 1.5300000000000002e-06, "loss": 0.0018, "num_tokens": 2161854.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 7247 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.75, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 19.75, "completions/mean_terminated_length": 19.75, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 134.22222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.03393029421567917, "kl": 0.01674473285675049, "learning_rate": 1.5294444444444445e-06, "loss": 0.0008, "num_tokens": 2162181.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7248 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.75, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 16.75, "completions/mean_terminated_length": 16.75, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 134.24074074074073, "frac_reward_zero_std": 1.0, "grad_norm": 0.037891633808612823, "kl": 0.15261739492416382, "learning_rate": 1.5288888888888889e-06, "loss": 0.0077, "num_tokens": 2162488.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7249 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 134.25925925925927, "frac_reward_zero_std": 1.0, "grad_norm": 0.007127865217626095, "kl": 0.23891860991716385, "learning_rate": 1.5283333333333332e-06, "loss": 0.0119, "num_tokens": 2162788.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7250 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.25, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 19.25, "completions/mean_terminated_length": 19.25, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 134.27777777777777, "frac_reward_zero_std": 0.0, "grad_norm": 1.7988146543502808, "kl": 0.0480719287879765, "learning_rate": 1.527777777777778e-06, "loss": 0.0017, "num_tokens": 2163085.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 7251 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0071428571827709675, "clip_ratio/low_min": 0.0071428571827709675, "clip_ratio/region_mean": 0.0071428571827709675, "completion_length": 36.75, "completions/clipped_ratio": 0.0, "completions/max_length": 41.0, "completions/max_terminated_length": 41.0, "completions/mean_length": 36.75, "completions/mean_terminated_length": 36.75, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 134.2962962962963, "frac_reward_zero_std": 0.0, "grad_norm": 3.3745758533477783, "kl": 0.1639673300087452, "learning_rate": 1.5272222222222224e-06, "loss": 0.0062, "num_tokens": 2163484.0, "reward": 5.75, "reward_std": 2.598076105117798, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 2.598076105117798, "step": 7252 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 134.3148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.007688008714467287, "kl": 0.001299142837524414, "learning_rate": 1.526666666666667e-06, "loss": 0.0001, "num_tokens": 2163696.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 7253 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 134.33333333333334, "frac_reward_zero_std": 1.0, "grad_norm": 0.019358614459633827, "kl": 0.18689166009426117, "learning_rate": 1.5261111111111113e-06, "loss": 0.0093, "num_tokens": 2163980.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7254 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.0, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 22.0, "completions/mean_terminated_length": 22.0, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 134.35185185185185, "frac_reward_zero_std": 0.0, "grad_norm": 2.2184417247772217, "kl": 0.03633656119927764, "learning_rate": 1.5255555555555557e-06, "loss": 0.001, "num_tokens": 2164312.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 7255 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.75, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 4.75, "completions/mean_terminated_length": 4.75, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 134.37037037037038, "frac_reward_zero_std": 1.0, "grad_norm": 0.02543286234140396, "kl": 0.001096084713935852, "learning_rate": 1.525e-06, "loss": 0.0001, "num_tokens": 2164531.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 7256 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 7.5, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 7.5, "completions/mean_terminated_length": 7.5, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 134.38888888888889, "frac_reward_zero_std": 0.0, "grad_norm": 8.149341583251953, "kl": 0.012931601144373417, "learning_rate": 1.5244444444444446e-06, "loss": 0.0842, "num_tokens": 2164769.0, "reward": 3.125, "reward_std": 1.75, "rewards/reward_combined/mean": 3.125, "rewards/reward_combined/std": 1.75, "step": 7257 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.75, "completions/clipped_ratio": 0.0, "completions/max_length": 42.0, "completions/max_terminated_length": 42.0, "completions/mean_length": 32.75, "completions/mean_terminated_length": 32.75, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 134.40740740740742, "frac_reward_zero_std": 0.0, "grad_norm": 2.4841113090515137, "kl": 0.13793478906154633, "learning_rate": 1.523888888888889e-06, "loss": 0.1068, "num_tokens": 2165116.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 7258 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.25, "completions/clipped_ratio": 0.0, "completions/max_length": 25.0, "completions/max_terminated_length": 25.0, "completions/mean_length": 19.25, "completions/mean_terminated_length": 19.25, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 134.42592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.08820168673992157, "kl": 0.07238448038697243, "learning_rate": 1.5233333333333333e-06, "loss": 0.0038, "num_tokens": 2165413.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 7259 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 44.5, "completions/clipped_ratio": 0.0, "completions/max_length": 47.0, "completions/max_terminated_length": 47.0, "completions/mean_length": 44.5, "completions/mean_terminated_length": 44.5, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "epoch": 134.44444444444446, "frac_reward_zero_std": 1.0, "grad_norm": 0.325691819190979, "kl": 0.11315299570560455, "learning_rate": 1.522777777777778e-06, "loss": 0.0058, "num_tokens": 2165859.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7260 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.5, "completions/clipped_ratio": 0.0, "completions/max_length": 33.0, "completions/max_terminated_length": 33.0, "completions/mean_length": 24.5, "completions/mean_terminated_length": 24.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 134.46296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.15787656605243683, "kl": 0.08496073633432388, "learning_rate": 1.5222222222222224e-06, "loss": 0.0037, "num_tokens": 2166185.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7261 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.75, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.75, "completions/mean_terminated_length": 13.75, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 134.4814814814815, "frac_reward_zero_std": 0.0, "grad_norm": 4.31198263168335, "kl": 0.09291960089467466, "learning_rate": 1.5216666666666668e-06, "loss": -0.0115, "num_tokens": 2166472.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 7262 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.0, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 21.0, "completions/mean_terminated_length": 21.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 134.5, "frac_reward_zero_std": 1.0, "grad_norm": 0.023829452693462372, "kl": 0.00873122364282608, "learning_rate": 1.5211111111111113e-06, "loss": 0.0004, "num_tokens": 2166804.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 7263 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 134.5185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.040066637098789215, "kl": 0.0045352845918387175, "learning_rate": 1.5205555555555557e-06, "loss": 0.0002, "num_tokens": 2167064.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7264 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.75, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.75, "completions/mean_terminated_length": 13.75, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 134.53703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.027337247505784035, "kl": 0.002178741851821542, "learning_rate": 1.52e-06, "loss": 0.0001, "num_tokens": 2167331.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7265 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 134.55555555555554, "frac_reward_zero_std": 1.0, "grad_norm": 0.0058513060212135315, "kl": 0.000874209392350167, "learning_rate": 1.5194444444444444e-06, "loss": 0.0, "num_tokens": 2167591.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 7266 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 5.75, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 5.75, "completions/mean_terminated_length": 5.75, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 134.57407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.049668893218040466, "kl": 0.025118354707956314, "learning_rate": 1.518888888888889e-06, "loss": 0.0012, "num_tokens": 2167814.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 7267 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 134.59259259259258, "frac_reward_zero_std": 1.0, "grad_norm": 0.07718837261199951, "kl": 0.056175585836172104, "learning_rate": 1.5183333333333333e-06, "loss": 0.0031, "num_tokens": 2168090.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7268 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 36.25, "completions/clipped_ratio": 0.0, "completions/max_length": 42.0, "completions/max_terminated_length": 42.0, "completions/mean_length": 36.25, "completions/mean_terminated_length": 36.25, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 134.61111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 5.2365851402282715, "kl": 0.5246870405972004, "learning_rate": 1.5177777777777781e-06, "loss": 0.0286, "num_tokens": 2168463.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7269 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.5, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 22.5, "completions/mean_terminated_length": 22.5, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 134.62962962962962, "frac_reward_zero_std": 1.0, "grad_norm": 0.03871531039476395, "kl": 0.013220083899796009, "learning_rate": 1.5172222222222225e-06, "loss": 0.0007, "num_tokens": 2168781.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7270 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 134.64814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.011146622709929943, "kl": 0.005558419041335583, "learning_rate": 1.5166666666666668e-06, "loss": 0.0003, "num_tokens": 2169093.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 7271 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 134.66666666666666, "frac_reward_zero_std": 1.0, "grad_norm": 0.948258638381958, "kl": 0.11385248601436615, "learning_rate": 1.5161111111111112e-06, "loss": 0.0057, "num_tokens": 2169305.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 7272 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.25, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.25, "completions/mean_terminated_length": 11.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 134.6851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.012006687000393867, "kl": 0.008160830475389957, "learning_rate": 1.5155555555555558e-06, "loss": 0.0004, "num_tokens": 2169578.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7273 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 134.7037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.03982282057404518, "kl": 0.008831947925500572, "learning_rate": 1.5150000000000001e-06, "loss": 0.0004, "num_tokens": 2169850.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7274 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 134.72222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.010898241773247719, "kl": 0.0002897203085012734, "learning_rate": 1.5144444444444445e-06, "loss": 0.0, "num_tokens": 2170106.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 7275 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 134.74074074074073, "frac_reward_zero_std": 1.0, "grad_norm": 0.0073975152336061, "kl": 0.0033451616764068604, "learning_rate": 1.5138888888888888e-06, "loss": 0.0002, "num_tokens": 2170366.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 7276 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.75, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 17.75, "completions/mean_terminated_length": 17.75, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 134.75925925925927, "frac_reward_zero_std": 1.0, "grad_norm": 0.03576982393860817, "kl": 0.0049235905753448606, "learning_rate": 1.5133333333333334e-06, "loss": 0.0002, "num_tokens": 2170677.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7277 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 47.0, "completions/clipped_ratio": 0.0, "completions/max_length": 51.0, "completions/max_terminated_length": 51.0, "completions/mean_length": 47.0, "completions/mean_terminated_length": 47.0, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 134.77777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.013484302908182144, "kl": 0.040173305198550224, "learning_rate": 1.512777777777778e-06, "loss": 0.002, "num_tokens": 2171145.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 7278 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 134.7962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.00015219366468954831, "kl": 1.6964972019195557e-05, "learning_rate": 1.5122222222222225e-06, "loss": 0.0, "num_tokens": 2171365.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 7279 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.25, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 13.25, "completions/mean_terminated_length": 13.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 134.8148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.6734110713005066, "kl": 0.11648636311292648, "learning_rate": 1.5116666666666669e-06, "loss": 0.006, "num_tokens": 2171670.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7280 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.0, "completions/clipped_ratio": 0.0, "completions/max_length": 38.0, "completions/max_terminated_length": 38.0, "completions/mean_length": 21.0, "completions/mean_terminated_length": 21.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 134.83333333333334, "frac_reward_zero_std": 1.0, "grad_norm": 0.1569831520318985, "kl": 0.03949368093162775, "learning_rate": 1.5111111111111112e-06, "loss": 0.0018, "num_tokens": 2171974.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7281 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.75, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 15.75, "completions/mean_terminated_length": 15.75, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 134.85185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.04143398255109787, "kl": 0.03825227916240692, "learning_rate": 1.5105555555555556e-06, "loss": 0.0019, "num_tokens": 2172269.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7282 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 134.87037037037038, "frac_reward_zero_std": 1.0, "grad_norm": 0.013431035913527012, "kl": 0.015110201202332973, "learning_rate": 1.5100000000000002e-06, "loss": 0.0008, "num_tokens": 2172529.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7283 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.5, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 2.5, "completions/mean_terminated_length": 2.5, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 134.88888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.07032839208841324, "kl": 0.006093740463256836, "learning_rate": 1.5094444444444445e-06, "loss": 0.0003, "num_tokens": 2172735.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 7284 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 134.90740740740742, "frac_reward_zero_std": 1.0, "grad_norm": 0.003963954746723175, "kl": 0.2820764482021332, "learning_rate": 1.5088888888888889e-06, "loss": 0.0141, "num_tokens": 2173023.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7285 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 134.92592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.12343496084213257, "kl": 0.030853819102048874, "learning_rate": 1.5083333333333336e-06, "loss": 0.0015, "num_tokens": 2173313.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7286 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.75, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 17.75, "completions/mean_terminated_length": 17.75, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 134.94444444444446, "frac_reward_zero_std": 0.0, "grad_norm": 2.998082399368286, "kl": 0.01617884822189808, "learning_rate": 1.507777777777778e-06, "loss": -0.0337, "num_tokens": 2173612.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 7287 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 134.96296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.004310925491154194, "kl": 0.010322198271751404, "learning_rate": 1.5072222222222224e-06, "loss": 0.0005, "num_tokens": 2173848.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 7288 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 38.5, "completions/clipped_ratio": 0.0, "completions/max_length": 59.0, "completions/max_terminated_length": 59.0, "completions/mean_length": 38.5, "completions/mean_terminated_length": 38.5, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 134.9814814814815, "frac_reward_zero_std": 0.0, "grad_norm": 1.8445016145706177, "kl": 0.09790194779634476, "learning_rate": 1.506666666666667e-06, "loss": 0.0234, "num_tokens": 2174238.0, "reward": 3.75, "reward_std": 2.723355770111084, "rewards/reward_combined/mean": 3.75, "rewards/reward_combined/std": 2.723355770111084, "step": 7289 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 31.75, "completions/clipped_ratio": 0.0, "completions/max_length": 35.0, "completions/max_terminated_length": 35.0, "completions/mean_length": 31.75, "completions/mean_terminated_length": 31.75, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 135.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.10184597223997116, "kl": 0.13429078459739685, "learning_rate": 1.5061111111111113e-06, "loss": 0.0068, "num_tokens": 2174589.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7290 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.25, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 11.25, "completions/mean_terminated_length": 11.25, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 135.0185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.034993600100278854, "kl": 0.005050043226219714, "learning_rate": 1.5055555555555556e-06, "loss": 0.0003, "num_tokens": 2174854.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7291 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 39.75, "completions/clipped_ratio": 0.0, "completions/max_length": 45.0, "completions/max_terminated_length": 45.0, "completions/mean_length": 39.75, "completions/mean_terminated_length": 39.75, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 135.03703703703704, "frac_reward_zero_std": 0.0, "grad_norm": 0.5175795555114746, "kl": 0.2549090050160885, "learning_rate": 1.505e-06, "loss": -0.1433, "num_tokens": 2175281.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 7292 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.0, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 23.0, "completions/mean_terminated_length": 23.0, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 135.05555555555554, "frac_reward_zero_std": 1.0, "grad_norm": 0.057604894042015076, "kl": 0.07324554771184921, "learning_rate": 1.5044444444444446e-06, "loss": 0.0037, "num_tokens": 2175593.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 7293 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.5, "completions/clipped_ratio": 0.0, "completions/max_length": 43.0, "completions/max_terminated_length": 43.0, "completions/mean_length": 21.5, "completions/mean_terminated_length": 21.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 135.07407407407408, "frac_reward_zero_std": 0.0, "grad_norm": 2.802600383758545, "kl": 0.46119312942028046, "learning_rate": 1.503888888888889e-06, "loss": -0.0043, "num_tokens": 2175899.0, "reward": 5.625, "reward_std": 2.75, "rewards/reward_combined/mean": 5.625, "rewards/reward_combined/std": 2.75, "step": 7294 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 135.09259259259258, "frac_reward_zero_std": 1.0, "grad_norm": 0.0026061483658850193, "kl": 0.00025004148483276367, "learning_rate": 1.5033333333333337e-06, "loss": 0.0, "num_tokens": 2176155.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 7295 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 135.11111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.0009876529220491648, "kl": 0.0017432059394195676, "learning_rate": 1.502777777777778e-06, "loss": 0.0001, "num_tokens": 2176435.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7296 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.0, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 21.0, "completions/mean_terminated_length": 21.0, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 135.12962962962962, "frac_reward_zero_std": 1.0, "grad_norm": 0.07706035673618317, "kl": 0.018631151411682367, "learning_rate": 1.5022222222222224e-06, "loss": 0.0009, "num_tokens": 2176771.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7297 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 28.75, "completions/clipped_ratio": 0.0, "completions/max_length": 51.0, "completions/max_terminated_length": 51.0, "completions/mean_length": 28.75, "completions/mean_terminated_length": 28.75, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 135.14814814814815, "frac_reward_zero_std": 0.0, "grad_norm": 2.3952693939208984, "kl": 0.03895064443349838, "learning_rate": 1.5016666666666668e-06, "loss": 0.1988, "num_tokens": 2177106.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 7298 }, { "clip_ratio/high_max": 0.013888888992369175, "clip_ratio/high_mean": 0.013888888992369175, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.013888888992369175, "completion_length": 22.0, "completions/clipped_ratio": 0.0, "completions/max_length": 28.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 22.0, "completions/mean_terminated_length": 22.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 135.16666666666666, "frac_reward_zero_std": 0.0, "grad_norm": 3.175672769546509, "kl": 0.0934031754732132, "learning_rate": 1.5011111111111113e-06, "loss": -0.0359, "num_tokens": 2177422.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 7299 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 7.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 7.0, "completions/mean_terminated_length": 7.0, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 135.1851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.01151592843234539, "kl": 0.0006222384399734437, "learning_rate": 1.5005555555555557e-06, "loss": 0.0, "num_tokens": 2177658.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 7300 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.25, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 12.25, "completions/mean_terminated_length": 12.25, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 135.2037037037037, "frac_reward_zero_std": 0.0, "grad_norm": 3.813812732696533, "kl": 0.0724743326427415, "learning_rate": 1.5e-06, "loss": -0.048, "num_tokens": 2177931.0, "reward": 7.625, "reward_std": 0.25, "rewards/reward_combined/mean": 7.625, "rewards/reward_combined/std": 0.25, "step": 7301 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 135.22222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.09224043041467667, "kl": 0.003516748547554016, "learning_rate": 1.4994444444444444e-06, "loss": 0.0002, "num_tokens": 2178143.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 7302 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.75, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 17.75, "completions/mean_terminated_length": 17.75, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 135.24074074074073, "frac_reward_zero_std": 1.0, "grad_norm": 0.037461452186107635, "kl": 0.005492026801221073, "learning_rate": 1.498888888888889e-06, "loss": 0.0003, "num_tokens": 2178454.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7303 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 5.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 5.0, "completions/mean_terminated_length": 5.0, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 135.25925925925927, "frac_reward_zero_std": 1.0, "grad_norm": 0.0044770496897399426, "kl": 0.00046030283556319773, "learning_rate": 1.4983333333333335e-06, "loss": 0.0, "num_tokens": 2178674.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 7304 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 28.25, "completions/clipped_ratio": 0.0, "completions/max_length": 29.0, "completions/max_terminated_length": 29.0, "completions/mean_length": 28.25, "completions/mean_terminated_length": 28.25, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 135.27777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.05637050047516823, "kl": 0.13339848816394806, "learning_rate": 1.497777777777778e-06, "loss": 0.0067, "num_tokens": 2179003.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7305 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 135.2962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.00014882482355460525, "kl": 1.5892088413238525e-05, "learning_rate": 1.4972222222222225e-06, "loss": 0.0, "num_tokens": 2179223.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 7306 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 135.3148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.010921690613031387, "kl": 0.25166473537683487, "learning_rate": 1.4966666666666668e-06, "loss": 0.0125, "num_tokens": 2179521.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7307 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 135.33333333333334, "frac_reward_zero_std": 1.0, "grad_norm": 0.037201087921857834, "kl": 0.020183774642646313, "learning_rate": 1.4961111111111112e-06, "loss": 0.001, "num_tokens": 2179789.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7308 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 135.35185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.02799682691693306, "kl": 0.0037245809799060225, "learning_rate": 1.4955555555555557e-06, "loss": 0.0002, "num_tokens": 2180075.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7309 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 135.37037037037038, "frac_reward_zero_std": 1.0, "grad_norm": 0.03439351171255112, "kl": 0.009398476220667362, "learning_rate": 1.495e-06, "loss": 0.0005, "num_tokens": 2180367.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7310 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.5, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 22.5, "completions/mean_terminated_length": 22.5, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 135.38888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.2819441854953766, "kl": 0.05308866500854492, "learning_rate": 1.4944444444444444e-06, "loss": 0.0027, "num_tokens": 2180685.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7311 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 135.40740740740742, "frac_reward_zero_std": 1.0, "grad_norm": 0.024950828403234482, "kl": 0.009906763210892677, "learning_rate": 1.4938888888888888e-06, "loss": 0.0005, "num_tokens": 2180997.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 7312 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.5, "completions/clipped_ratio": 0.0, "completions/max_length": 45.0, "completions/max_terminated_length": 45.0, "completions/mean_length": 32.5, "completions/mean_terminated_length": 32.5, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 135.42592592592592, "frac_reward_zero_std": 0.0, "grad_norm": 3.7540366649627686, "kl": 0.2190949022769928, "learning_rate": 1.4933333333333336e-06, "loss": 0.1059, "num_tokens": 2181379.0, "reward": 6.0, "reward_std": 3.674234628677368, "rewards/reward_combined/mean": 6.0, "rewards/reward_combined/std": 3.674234628677368, "step": 7313 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.75, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 14.75, "completions/mean_terminated_length": 14.75, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 135.44444444444446, "frac_reward_zero_std": 0.0, "grad_norm": 5.034428119659424, "kl": 0.03239239752292633, "learning_rate": 1.492777777777778e-06, "loss": 0.1057, "num_tokens": 2181662.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 7314 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 34.25, "completions/clipped_ratio": 0.0, "completions/max_length": 36.0, "completions/max_terminated_length": 36.0, "completions/mean_length": 34.25, "completions/mean_terminated_length": 34.25, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 135.46296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.031913597136735916, "kl": 0.0789363943040371, "learning_rate": 1.4922222222222225e-06, "loss": 0.004, "num_tokens": 2182027.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7315 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.75, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 14.75, "completions/mean_terminated_length": 14.75, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 135.4814814814815, "frac_reward_zero_std": 0.0, "grad_norm": 4.00925350189209, "kl": 0.05768022881238721, "learning_rate": 1.4916666666666669e-06, "loss": 0.1432, "num_tokens": 2182298.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 7316 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 18.0, "completions/mean_terminated_length": 18.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 135.5, "frac_reward_zero_std": 1.0, "grad_norm": 0.04808998480439186, "kl": 0.00983132841065526, "learning_rate": 1.4911111111111112e-06, "loss": 0.0005, "num_tokens": 2182598.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7317 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.25, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 17.25, "completions/mean_terminated_length": 17.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 135.5185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.023187341168522835, "kl": 0.17278330028057098, "learning_rate": 1.4905555555555556e-06, "loss": 0.0086, "num_tokens": 2182907.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7318 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.75, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 15.75, "completions/mean_terminated_length": 15.75, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 135.53703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.04364403337240219, "kl": 0.036905257031321526, "learning_rate": 1.4900000000000001e-06, "loss": 0.0018, "num_tokens": 2183202.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7319 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 34.0, "completions/clipped_ratio": 0.0, "completions/max_length": 41.0, "completions/max_terminated_length": 41.0, "completions/mean_length": 34.0, "completions/mean_terminated_length": 34.0, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 135.55555555555554, "frac_reward_zero_std": 0.0, "grad_norm": 2.169273853302002, "kl": 0.09287399798631668, "learning_rate": 1.4894444444444445e-06, "loss": 0.0611, "num_tokens": 2183562.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 7320 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 135.57407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.054502710700035095, "kl": 0.010034669656306505, "learning_rate": 1.4888888888888888e-06, "loss": 0.0005, "num_tokens": 2183834.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7321 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.75, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 9.75, "completions/mean_terminated_length": 9.75, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 135.59259259259258, "frac_reward_zero_std": 1.0, "grad_norm": 0.2796548902988434, "kl": 0.08280870318412781, "learning_rate": 1.4883333333333336e-06, "loss": 0.0035, "num_tokens": 2184093.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 7322 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 26.75, "completions/clipped_ratio": 0.0, "completions/max_length": 29.0, "completions/max_terminated_length": 29.0, "completions/mean_length": 26.75, "completions/mean_terminated_length": 26.75, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 135.61111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.3327103853225708, "kl": 0.043935383670032024, "learning_rate": 1.487777777777778e-06, "loss": 0.0022, "num_tokens": 2184448.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 7323 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 17.0, "completions/mean_terminated_length": 17.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 135.62962962962962, "frac_reward_zero_std": 1.0, "grad_norm": 0.02109948731958866, "kl": 0.002494834829121828, "learning_rate": 1.4872222222222223e-06, "loss": 0.0001, "num_tokens": 2184756.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7324 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.5, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 3.5, "completions/mean_terminated_length": 3.5, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 135.64814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.027951333671808243, "kl": 0.0044674351811409, "learning_rate": 1.486666666666667e-06, "loss": 0.0002, "num_tokens": 2184966.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 7325 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.75, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 13.75, "completions/mean_terminated_length": 13.75, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 135.66666666666666, "frac_reward_zero_std": 1.0, "grad_norm": 0.07589644193649292, "kl": 0.03912397311069071, "learning_rate": 1.4861111111111113e-06, "loss": 0.0021, "num_tokens": 2185273.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7326 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 135.6851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.030246933922171593, "kl": 0.003952879458665848, "learning_rate": 1.4855555555555556e-06, "loss": 0.0002, "num_tokens": 2185517.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 7327 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 26.0, "completions/clipped_ratio": 0.0, "completions/max_length": 37.0, "completions/max_terminated_length": 37.0, "completions/mean_length": 26.0, "completions/mean_terminated_length": 26.0, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 135.7037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.06851641833782196, "kl": 0.018347357399761677, "learning_rate": 1.485e-06, "loss": 0.001, "num_tokens": 2185865.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7328 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 135.72222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.003461727173998952, "kl": 0.28229978680610657, "learning_rate": 1.4844444444444445e-06, "loss": 0.0141, "num_tokens": 2186153.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7329 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 135.74074074074073, "frac_reward_zero_std": 1.0, "grad_norm": 0.01493521686643362, "kl": 0.004654309479519725, "learning_rate": 1.4838888888888889e-06, "loss": 0.0002, "num_tokens": 2186441.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7330 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 30.5, "completions/clipped_ratio": 0.0, "completions/max_length": 36.0, "completions/max_terminated_length": 36.0, "completions/mean_length": 30.5, "completions/mean_terminated_length": 30.5, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 135.75925925925927, "frac_reward_zero_std": 1.0, "grad_norm": 0.04905971512198448, "kl": 0.08136562630534172, "learning_rate": 1.4833333333333337e-06, "loss": 0.004, "num_tokens": 2186799.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7331 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.0, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 21.0, "completions/mean_terminated_length": 21.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 135.77777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.031569913029670715, "kl": 0.011532099917531013, "learning_rate": 1.482777777777778e-06, "loss": 0.0005, "num_tokens": 2187131.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7332 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 44.5, "completions/clipped_ratio": 0.0, "completions/max_length": 49.0, "completions/max_terminated_length": 49.0, "completions/mean_length": 44.5, "completions/mean_terminated_length": 44.5, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 135.7962962962963, "frac_reward_zero_std": 0.0, "grad_norm": 2.4297561645507812, "kl": 0.046414170414209366, "learning_rate": 1.4822222222222224e-06, "loss": 0.0362, "num_tokens": 2187589.0, "reward": 2.375, "reward_std": 1.25, "rewards/reward_combined/mean": 2.375, "rewards/reward_combined/std": 1.25, "step": 7333 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 135.8148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.0271986685693264, "kl": 0.004163614008575678, "learning_rate": 1.4816666666666667e-06, "loss": 0.0002, "num_tokens": 2187857.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7334 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 18.0, "completions/mean_terminated_length": 18.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 135.83333333333334, "frac_reward_zero_std": 1.0, "grad_norm": 0.0806516781449318, "kl": 0.02543980348855257, "learning_rate": 1.4811111111111113e-06, "loss": 0.0013, "num_tokens": 2188149.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7335 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 135.85185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.019115418195724487, "kl": 0.0017111115157604218, "learning_rate": 1.4805555555555557e-06, "loss": 0.0001, "num_tokens": 2188473.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7336 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 135.87037037037038, "frac_reward_zero_std": 1.0, "grad_norm": 0.004694969393312931, "kl": 0.010236270725727081, "learning_rate": 1.48e-06, "loss": 0.0005, "num_tokens": 2188709.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 7337 }, { "clip_ratio/high_max": 0.008064515888690948, "clip_ratio/high_mean": 0.008064515888690948, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.008064515888690948, "completion_length": 30.75, "completions/clipped_ratio": 0.0, "completions/max_length": 34.0, "completions/max_terminated_length": 34.0, "completions/mean_length": 30.75, "completions/mean_terminated_length": 30.75, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 135.88888888888889, "frac_reward_zero_std": 0.0, "grad_norm": 2.2478222846984863, "kl": 0.18177735805511475, "learning_rate": 1.4794444444444444e-06, "loss": 0.0604, "num_tokens": 2189056.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 7338 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 135.90740740740742, "frac_reward_zero_std": 1.0, "grad_norm": 0.003624641103670001, "kl": 0.0032641030848026276, "learning_rate": 1.478888888888889e-06, "loss": 0.0002, "num_tokens": 2189316.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 7339 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 135.92592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.007582315243780613, "kl": 0.0009631514549255371, "learning_rate": 1.4783333333333335e-06, "loss": 0.0, "num_tokens": 2189528.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 7340 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.5, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 9.5, "completions/mean_terminated_length": 9.5, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 135.94444444444446, "frac_reward_zero_std": 0.0, "grad_norm": 5.0182366371154785, "kl": 0.668925229460001, "learning_rate": 1.477777777777778e-06, "loss": 0.0861, "num_tokens": 2189790.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 7341 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 10.5, "completions/mean_terminated_length": 10.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 135.96296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.024581637233495712, "kl": 0.0019512549042701721, "learning_rate": 1.4772222222222224e-06, "loss": 0.0001, "num_tokens": 2190052.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7342 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 135.9814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.030143478885293007, "kl": 0.028491176664829254, "learning_rate": 1.4766666666666668e-06, "loss": 0.0014, "num_tokens": 2190268.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 7343 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.25, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.25, "completions/mean_terminated_length": 13.25, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 136.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.03922262415289879, "kl": 0.013584417290985584, "learning_rate": 1.4761111111111111e-06, "loss": 0.0007, "num_tokens": 2190537.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7344 }, { "clip_ratio/high_max": 0.018518518656492233, "clip_ratio/high_mean": 0.018518518656492233, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.018518518656492233, "completion_length": 13.25, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 13.25, "completions/mean_terminated_length": 13.25, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 136.0185185185185, "frac_reward_zero_std": 0.0, "grad_norm": 3.7721199989318848, "kl": 0.19515752792358398, "learning_rate": 1.4755555555555557e-06, "loss": -0.0511, "num_tokens": 2190814.0, "reward": 7.625, "reward_std": 0.25, "rewards/reward_combined/mean": 7.625, "rewards/reward_combined/std": 0.25, "step": 7345 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 136.03703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.12053679674863815, "kl": 0.013243507943116128, "learning_rate": 1.475e-06, "loss": 0.0007, "num_tokens": 2191110.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7346 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 136.05555555555554, "frac_reward_zero_std": 1.0, "grad_norm": 0.10737114399671555, "kl": 0.00415569543838501, "learning_rate": 1.4744444444444444e-06, "loss": 0.0002, "num_tokens": 2191322.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 7347 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.5, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 8.5, "completions/mean_terminated_length": 8.5, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 136.07407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.015540034510195255, "kl": 0.0010031014680862427, "learning_rate": 1.4738888888888892e-06, "loss": 0.0, "num_tokens": 2191576.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 7348 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 136.09259259259258, "frac_reward_zero_std": 1.0, "grad_norm": 0.09555485099554062, "kl": 0.023576066829264164, "learning_rate": 1.4733333333333336e-06, "loss": 0.0012, "num_tokens": 2191854.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7349 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.5, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 17.5, "completions/mean_terminated_length": 17.5, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 136.11111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.6524398922920227, "kl": 0.16364783979952335, "learning_rate": 1.472777777777778e-06, "loss": 0.0078, "num_tokens": 2192164.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7350 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.25, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.25, "completions/mean_terminated_length": 13.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 136.12962962962962, "frac_reward_zero_std": 1.0, "grad_norm": 0.057836417108774185, "kl": 0.004366701934486628, "learning_rate": 1.4722222222222225e-06, "loss": 0.0002, "num_tokens": 2192429.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7351 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 136.14814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.06833767145872116, "kl": 0.01016496866941452, "learning_rate": 1.4716666666666668e-06, "loss": 0.0005, "num_tokens": 2192649.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 7352 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.75, "completions/clipped_ratio": 0.0, "completions/max_length": 28.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 20.75, "completions/mean_terminated_length": 20.75, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 136.16666666666666, "frac_reward_zero_std": 1.0, "grad_norm": 1.1307151317596436, "kl": 0.23169080540537834, "learning_rate": 1.4711111111111112e-06, "loss": 0.0109, "num_tokens": 2192952.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 7353 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.0, "completions/max_length": 49.0, "completions/max_terminated_length": 49.0, "completions/mean_length": 32.0, "completions/mean_terminated_length": 32.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 136.1851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.14355669915676117, "kl": 0.04910988360643387, "learning_rate": 1.4705555555555555e-06, "loss": 0.0023, "num_tokens": 2193300.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7354 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.0, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 20.0, "completions/mean_terminated_length": 20.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 136.2037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 2.264404535293579, "kl": 0.5616024248301983, "learning_rate": 1.4700000000000001e-06, "loss": 0.0321, "num_tokens": 2193632.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7355 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 136.22222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.05025370791554451, "kl": 0.029571893624961376, "learning_rate": 1.4694444444444445e-06, "loss": 0.0015, "num_tokens": 2193922.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7356 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.0, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 20.0, "completions/mean_terminated_length": 20.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 136.24074074074073, "frac_reward_zero_std": 1.0, "grad_norm": 0.04609702154994011, "kl": 0.010735069867223501, "learning_rate": 1.468888888888889e-06, "loss": 0.0006, "num_tokens": 2194250.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 7357 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 136.25925925925927, "frac_reward_zero_std": 1.0, "grad_norm": 0.02098163776099682, "kl": 0.00353066623210907, "learning_rate": 1.4683333333333336e-06, "loss": 0.0002, "num_tokens": 2194510.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 7358 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.0, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 23.0, "completions/mean_terminated_length": 23.0, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 136.27777777777777, "frac_reward_zero_std": 0.0, "grad_norm": 2.6197712421417236, "kl": 0.1480445321649313, "learning_rate": 1.467777777777778e-06, "loss": 0.0526, "num_tokens": 2194830.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 7359 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 31.5, "completions/clipped_ratio": 0.0, "completions/max_length": 40.0, "completions/max_terminated_length": 40.0, "completions/mean_length": 31.5, "completions/mean_terminated_length": 31.5, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 136.2962962962963, "frac_reward_zero_std": 0.0, "grad_norm": 1.4875545501708984, "kl": 0.1670476272702217, "learning_rate": 1.4672222222222223e-06, "loss": 0.132, "num_tokens": 2195192.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 7360 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.25, "completions/clipped_ratio": 0.0, "completions/max_length": 39.0, "completions/max_terminated_length": 39.0, "completions/mean_length": 32.25, "completions/mean_terminated_length": 32.25, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 136.3148148148148, "frac_reward_zero_std": 0.0, "grad_norm": 1.1091351509094238, "kl": 0.0752862710505724, "learning_rate": 1.4666666666666669e-06, "loss": -0.1695, "num_tokens": 2195573.0, "reward": 7.875, "reward_std": 0.25, "rewards/reward_combined/mean": 7.875, "rewards/reward_combined/std": 0.25, "step": 7361 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 136.33333333333334, "frac_reward_zero_std": 1.0, "grad_norm": 0.04003920033574104, "kl": 0.0027030179044231772, "learning_rate": 1.4661111111111112e-06, "loss": 0.0001, "num_tokens": 2195833.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7362 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 5.25, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 5.25, "completions/mean_terminated_length": 5.25, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 136.35185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.08440729230642319, "kl": 0.005042665055952966, "learning_rate": 1.4655555555555556e-06, "loss": 0.0002, "num_tokens": 2196054.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 7363 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 136.37037037037038, "frac_reward_zero_std": 1.0, "grad_norm": 0.0687917098402977, "kl": 0.03460954315960407, "learning_rate": 1.465e-06, "loss": 0.0019, "num_tokens": 2196326.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7364 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.75, "completions/clipped_ratio": 0.0, "completions/max_length": 25.0, "completions/max_terminated_length": 25.0, "completions/mean_length": 21.75, "completions/mean_terminated_length": 21.75, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 136.38888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.06932007521390915, "kl": 0.02735055610537529, "learning_rate": 1.4644444444444445e-06, "loss": 0.0014, "num_tokens": 2196661.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7365 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.5, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 3.5, "completions/mean_terminated_length": 3.5, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 136.40740740740742, "frac_reward_zero_std": 1.0, "grad_norm": 0.029099075123667717, "kl": 0.0031268224120140076, "learning_rate": 1.463888888888889e-06, "loss": 0.0001, "num_tokens": 2196871.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 7366 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 35.5, "completions/clipped_ratio": 0.0, "completions/max_length": 43.0, "completions/max_terminated_length": 43.0, "completions/mean_length": 35.5, "completions/mean_terminated_length": 35.5, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 136.42592592592592, "frac_reward_zero_std": 0.0, "grad_norm": 2.351689338684082, "kl": 0.21201051771640778, "learning_rate": 1.4633333333333337e-06, "loss": -0.0414, "num_tokens": 2197237.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 7367 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 136.44444444444446, "frac_reward_zero_std": 1.0, "grad_norm": 0.004738462157547474, "kl": 0.00013793110701953992, "learning_rate": 1.462777777777778e-06, "loss": 0.0, "num_tokens": 2197493.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 7368 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 40.0, "completions/clipped_ratio": 0.0, "completions/max_length": 45.0, "completions/max_terminated_length": 45.0, "completions/mean_length": 40.0, "completions/mean_terminated_length": 40.0, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 136.46296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.020441211760044098, "kl": 0.08040140196681023, "learning_rate": 1.4622222222222224e-06, "loss": 0.004, "num_tokens": 2197921.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7369 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 136.4814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.0034074061550199986, "kl": 0.2821911871433258, "learning_rate": 1.4616666666666667e-06, "loss": 0.0141, "num_tokens": 2198209.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7370 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 136.5, "frac_reward_zero_std": 1.0, "grad_norm": 0.007356319576501846, "kl": 0.0008717775344848633, "learning_rate": 1.4611111111111113e-06, "loss": 0.0, "num_tokens": 2198421.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 7371 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 136.5185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.01866178773343563, "kl": 0.00747508741915226, "learning_rate": 1.4605555555555556e-06, "loss": 0.0004, "num_tokens": 2198695.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7372 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.25, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 16.25, "completions/mean_terminated_length": 16.25, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 136.53703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.03995149955153465, "kl": 0.0025298953987658024, "learning_rate": 1.46e-06, "loss": 0.0001, "num_tokens": 2199020.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7373 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 136.55555555555554, "frac_reward_zero_std": 0.0, "grad_norm": 1.2032420635223389, "kl": 0.2951899570180103, "learning_rate": 1.4594444444444444e-06, "loss": 0.0715, "num_tokens": 2199298.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 7374 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 18.0, "completions/mean_terminated_length": 18.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 136.57407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.00849279947578907, "kl": 0.0018055455875582993, "learning_rate": 1.4588888888888891e-06, "loss": 0.0001, "num_tokens": 2199610.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7375 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 136.59259259259258, "frac_reward_zero_std": 1.0, "grad_norm": 0.13561208546161652, "kl": 0.018184350105002522, "learning_rate": 1.4583333333333335e-06, "loss": 0.0009, "num_tokens": 2199898.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7376 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 10.5, "completions/mean_terminated_length": 10.5, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 136.61111111111111, "frac_reward_zero_std": 0.0, "grad_norm": 12.179879188537598, "kl": 2.5371987484395504, "learning_rate": 1.457777777777778e-06, "loss": 0.2364, "num_tokens": 2200164.0, "reward": 7.0, "reward_std": 2.0, "rewards/reward_combined/mean": 7.0, "rewards/reward_combined/std": 2.0, "step": 7377 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 136.62962962962962, "frac_reward_zero_std": 1.0, "grad_norm": 0.010891375131905079, "kl": 0.027317926287651062, "learning_rate": 1.4572222222222224e-06, "loss": 0.0014, "num_tokens": 2200380.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 7378 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.75, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 21.75, "completions/mean_terminated_length": 21.75, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 136.64814814814815, "frac_reward_zero_std": 0.0, "grad_norm": 1.0908761024475098, "kl": 0.01722109317779541, "learning_rate": 1.4566666666666668e-06, "loss": 0.0013, "num_tokens": 2200711.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 7379 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.75, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 17.75, "completions/mean_terminated_length": 17.75, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 136.66666666666666, "frac_reward_zero_std": 1.0, "grad_norm": 0.05306253954768181, "kl": 0.1732923984527588, "learning_rate": 1.4561111111111111e-06, "loss": 0.0086, "num_tokens": 2201022.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7380 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.75, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.75, "completions/mean_terminated_length": 6.75, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 136.6851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.009447266347706318, "kl": 0.00044105592678533867, "learning_rate": 1.4555555555555557e-06, "loss": 0.0, "num_tokens": 2201257.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 7381 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 136.7037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.07642372697591782, "kl": 0.008162498474121094, "learning_rate": 1.455e-06, "loss": 0.0004, "num_tokens": 2201541.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7382 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 136.72222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.14196544885635376, "kl": 0.01839529164135456, "learning_rate": 1.4544444444444444e-06, "loss": 0.0009, "num_tokens": 2201825.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7383 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.25, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 19.25, "completions/mean_terminated_length": 19.25, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 136.74074074074073, "frac_reward_zero_std": 1.0, "grad_norm": 0.05191543325781822, "kl": 0.017379973083734512, "learning_rate": 1.4538888888888892e-06, "loss": 0.0009, "num_tokens": 2202122.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7384 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 31.75, "completions/clipped_ratio": 0.0, "completions/max_length": 45.0, "completions/max_terminated_length": 45.0, "completions/mean_length": 31.75, "completions/mean_terminated_length": 31.75, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 136.75925925925927, "frac_reward_zero_std": 0.0, "grad_norm": 2.3711869716644287, "kl": 0.14651967585086823, "learning_rate": 1.4533333333333335e-06, "loss": 0.0047, "num_tokens": 2202465.0, "reward": 5.75, "reward_std": 2.598076105117798, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 2.598076105117798, "step": 7385 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.75, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 9.75, "completions/mean_terminated_length": 9.75, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 136.77777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.18765367567539215, "kl": 0.037703718058764935, "learning_rate": 1.4527777777777779e-06, "loss": 0.0021, "num_tokens": 2202716.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 7386 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 136.7962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.004976043477654457, "kl": 0.010107621550559998, "learning_rate": 1.4522222222222225e-06, "loss": 0.0005, "num_tokens": 2202952.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 7387 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 136.8148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.024367371574044228, "kl": 0.1853683590888977, "learning_rate": 1.4516666666666668e-06, "loss": 0.0093, "num_tokens": 2203236.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7388 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.0, "completions/clipped_ratio": 0.0, "completions/max_length": 29.0, "completions/max_terminated_length": 29.0, "completions/mean_length": 22.0, "completions/mean_terminated_length": 22.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 136.83333333333334, "frac_reward_zero_std": 1.0, "grad_norm": 0.09929947555065155, "kl": 0.07037895172834396, "learning_rate": 1.4511111111111112e-06, "loss": 0.0034, "num_tokens": 2203552.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7389 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 136.85185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.024260081350803375, "kl": 0.0034699002280831337, "learning_rate": 1.4505555555555555e-06, "loss": 0.0002, "num_tokens": 2203864.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 7390 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.25, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 11.25, "completions/mean_terminated_length": 11.25, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 136.87037037037038, "frac_reward_zero_std": 1.0, "grad_norm": 0.04320525750517845, "kl": 0.004037460079416633, "learning_rate": 1.45e-06, "loss": 0.0002, "num_tokens": 2204129.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7391 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 35.75, "completions/clipped_ratio": 0.0, "completions/max_length": 38.0, "completions/max_terminated_length": 38.0, "completions/mean_length": 35.75, "completions/mean_terminated_length": 35.75, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 136.88888888888889, "frac_reward_zero_std": 0.0, "grad_norm": 1.6039458513259888, "kl": 0.26614756882190704, "learning_rate": 1.4494444444444444e-06, "loss": 0.0157, "num_tokens": 2204500.0, "reward": 4.5, "reward_std": 4.041451930999756, "rewards/reward_combined/mean": 4.5, "rewards/reward_combined/std": 4.041451930999756, "step": 7392 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 136.90740740740742, "frac_reward_zero_std": 1.0, "grad_norm": 0.006658663973212242, "kl": 0.23876313865184784, "learning_rate": 1.4488888888888892e-06, "loss": 0.0119, "num_tokens": 2204800.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7393 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 30.25, "completions/clipped_ratio": 0.0, "completions/max_length": 33.0, "completions/max_terminated_length": 33.0, "completions/mean_length": 30.25, "completions/mean_terminated_length": 30.25, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 136.92592592592592, "frac_reward_zero_std": 0.0, "grad_norm": 2.3099477291107178, "kl": 0.20922984182834625, "learning_rate": 1.4483333333333336e-06, "loss": -0.002, "num_tokens": 2205145.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 7394 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 43.0, "completions/clipped_ratio": 0.0, "completions/max_length": 43.0, "completions/max_terminated_length": 43.0, "completions/mean_length": 43.0, "completions/mean_terminated_length": 43.0, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 136.94444444444446, "frac_reward_zero_std": 1.0, "grad_norm": 0.022350691258907318, "kl": 0.048244534060359, "learning_rate": 1.447777777777778e-06, "loss": 0.0024, "num_tokens": 2205597.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 7395 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.25, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 16.25, "completions/mean_terminated_length": 16.25, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 136.96296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.059311412274837494, "kl": 0.017900697886943817, "learning_rate": 1.4472222222222223e-06, "loss": 0.0009, "num_tokens": 2205890.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7396 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 136.9814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.08092915266752243, "kl": 0.020711537450551987, "learning_rate": 1.4466666666666669e-06, "loss": 0.0011, "num_tokens": 2206164.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7397 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 137.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.019344164058566093, "kl": 0.013957891147583723, "learning_rate": 1.4461111111111112e-06, "loss": 0.0007, "num_tokens": 2206424.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7398 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 35.5, "completions/clipped_ratio": 0.0, "completions/max_length": 36.0, "completions/max_terminated_length": 36.0, "completions/mean_length": 35.5, "completions/mean_terminated_length": 35.5, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 137.0185185185185, "frac_reward_zero_std": 0.0, "grad_norm": 2.9102940559387207, "kl": 0.0840768963098526, "learning_rate": 1.4455555555555556e-06, "loss": 0.0047, "num_tokens": 2206794.0, "reward": 6.75, "reward_std": 2.5, "rewards/reward_combined/mean": 6.75, "rewards/reward_combined/std": 2.5, "step": 7399 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.75, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 13.75, "completions/mean_terminated_length": 13.75, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 137.03703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.07379863411188126, "kl": 0.04359552636742592, "learning_rate": 1.445e-06, "loss": 0.0025, "num_tokens": 2207073.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7400 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 50.25, "completions/clipped_ratio": 0.0, "completions/max_length": 130.0, "completions/max_terminated_length": 130.0, "completions/mean_length": 50.25, "completions/mean_terminated_length": 50.25, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 137.05555555555554, "frac_reward_zero_std": 0.0, "grad_norm": 2.2701539993286133, "kl": 0.04374916851520538, "learning_rate": 1.4444444444444445e-06, "loss": 0.3616, "num_tokens": 2207522.0, "reward": 6.25, "reward_std": 2.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 2.5, "step": 7401 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 137.07407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.18513983488082886, "kl": 0.01585161779075861, "learning_rate": 1.443888888888889e-06, "loss": 0.0008, "num_tokens": 2207792.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7402 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 26.0, "completions/clipped_ratio": 0.0, "completions/max_length": 33.0, "completions/max_terminated_length": 33.0, "completions/mean_length": 26.0, "completions/mean_terminated_length": 26.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 137.09259259259258, "frac_reward_zero_std": 0.0, "grad_norm": 4.625370502471924, "kl": 0.21748222410678864, "learning_rate": 1.4433333333333336e-06, "loss": 0.1895, "num_tokens": 2208116.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 7403 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 5.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 5.0, "completions/mean_terminated_length": 5.0, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 137.11111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.004046469461172819, "kl": 0.00036016106605529785, "learning_rate": 1.442777777777778e-06, "loss": 0.0, "num_tokens": 2208336.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 7404 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.5, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.5, "completions/mean_terminated_length": 6.5, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 137.12962962962962, "frac_reward_zero_std": 1.0, "grad_norm": 0.018184373155236244, "kl": 0.0008678081794641912, "learning_rate": 1.4422222222222223e-06, "loss": 0.0, "num_tokens": 2208570.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 7405 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 137.14814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.12030137330293655, "kl": 0.01557551883161068, "learning_rate": 1.4416666666666667e-06, "loss": 0.0008, "num_tokens": 2208858.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7406 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.75, "completions/clipped_ratio": 0.0, "completions/max_length": 36.0, "completions/max_terminated_length": 36.0, "completions/mean_length": 25.75, "completions/mean_terminated_length": 25.75, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 137.16666666666666, "frac_reward_zero_std": 0.0, "grad_norm": 2.697286367416382, "kl": 0.16296416381374002, "learning_rate": 1.4411111111111113e-06, "loss": 0.1171, "num_tokens": 2209189.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 7407 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 137.1851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.00015338108642026782, "kl": 1.9028782844543457e-05, "learning_rate": 1.4405555555555556e-06, "loss": 0.0, "num_tokens": 2209409.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 7408 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.0, "completions/clipped_ratio": 0.0, "completions/max_length": 42.0, "completions/max_terminated_length": 42.0, "completions/mean_length": 17.0, "completions/mean_terminated_length": 17.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 137.2037037037037, "frac_reward_zero_std": 0.0, "grad_norm": 5.065679550170898, "kl": 0.04440948273986578, "learning_rate": 1.44e-06, "loss": 0.3418, "num_tokens": 2209677.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 7409 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.75, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 12.75, "completions/mean_terminated_length": 12.75, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 137.22222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.05224311351776123, "kl": 0.010079814586788416, "learning_rate": 1.4394444444444448e-06, "loss": 0.0005, "num_tokens": 2209944.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7410 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.5, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 20.5, "completions/mean_terminated_length": 20.5, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 137.24074074074073, "frac_reward_zero_std": 0.0, "grad_norm": 2.531658172607422, "kl": 0.19938341714441776, "learning_rate": 1.4388888888888891e-06, "loss": 0.1264, "num_tokens": 2210246.0, "reward": 4.625, "reward_std": 3.4731109142303467, "rewards/reward_combined/mean": 4.625, "rewards/reward_combined/std": 3.4731109142303467, "step": 7411 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.25, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 12.25, "completions/mean_terminated_length": 12.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 137.25925925925927, "frac_reward_zero_std": 1.0, "grad_norm": 0.02451177127659321, "kl": 0.0013988212449476123, "learning_rate": 1.4383333333333335e-06, "loss": 0.0001, "num_tokens": 2210507.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7412 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.5, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 20.5, "completions/mean_terminated_length": 20.5, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 137.27777777777777, "frac_reward_zero_std": 0.0, "grad_norm": 4.1936421394348145, "kl": 0.10503705218434334, "learning_rate": 1.437777777777778e-06, "loss": -0.0225, "num_tokens": 2210809.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 7413 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 137.2962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.06811372935771942, "kl": 0.010646353708580136, "learning_rate": 1.4372222222222224e-06, "loss": 0.0005, "num_tokens": 2211097.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7414 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 137.3148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.011704860255122185, "kl": 0.0018689849530346692, "learning_rate": 1.4366666666666667e-06, "loss": 0.0001, "num_tokens": 2211393.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7415 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.5, "completions/clipped_ratio": 0.0, "completions/max_length": 28.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 24.5, "completions/mean_terminated_length": 24.5, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 137.33333333333334, "frac_reward_zero_std": 0.0, "grad_norm": 4.28645658493042, "kl": 0.040023354813456535, "learning_rate": 1.436111111111111e-06, "loss": 0.0874, "num_tokens": 2211739.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 7416 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.5, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 22.5, "completions/mean_terminated_length": 22.5, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 137.35185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.09193812310695648, "kl": 0.017855477519333363, "learning_rate": 1.4355555555555557e-06, "loss": 0.0009, "num_tokens": 2212073.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7417 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.25, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 15.25, "completions/mean_terminated_length": 15.25, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 137.37037037037038, "frac_reward_zero_std": 1.0, "grad_norm": 0.0417935810983181, "kl": 0.23719967156648636, "learning_rate": 1.435e-06, "loss": 0.0118, "num_tokens": 2212374.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7418 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.0, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 17.0, "completions/mean_terminated_length": 17.0, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 137.38888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.029195761308073997, "kl": 0.149742491543293, "learning_rate": 1.4344444444444446e-06, "loss": 0.0076, "num_tokens": 2212682.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7419 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.75, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 15.75, "completions/mean_terminated_length": 15.75, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 137.40740740740742, "frac_reward_zero_std": 0.0, "grad_norm": 3.778493881225586, "kl": 0.0150480424053967, "learning_rate": 1.4338888888888892e-06, "loss": 0.0257, "num_tokens": 2212977.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 7420 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 36.25, "completions/clipped_ratio": 0.0, "completions/max_length": 42.0, "completions/max_terminated_length": 42.0, "completions/mean_length": 36.25, "completions/mean_terminated_length": 36.25, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 137.42592592592592, "frac_reward_zero_std": 0.0, "grad_norm": 2.3246750831604004, "kl": 0.20078035444021225, "learning_rate": 1.4333333333333335e-06, "loss": 0.0211, "num_tokens": 2213358.0, "reward": 1.875, "reward_std": 4.308422088623047, "rewards/reward_combined/mean": 1.875, "rewards/reward_combined/std": 4.308422088623047, "step": 7421 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 27.0, "completions/clipped_ratio": 0.0, "completions/max_length": 35.0, "completions/max_terminated_length": 35.0, "completions/mean_length": 27.0, "completions/mean_terminated_length": 27.0, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 137.44444444444446, "frac_reward_zero_std": 0.0, "grad_norm": 1.670393705368042, "kl": 0.08662048354744911, "learning_rate": 1.4327777777777779e-06, "loss": 0.0046, "num_tokens": 2213718.0, "reward": 7.75, "reward_std": 0.28867512941360474, "rewards/reward_combined/mean": 7.75, "rewards/reward_combined/std": 0.28867512941360474, "step": 7422 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 137.46296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.011591038666665554, "kl": 0.003244701772928238, "learning_rate": 1.4322222222222224e-06, "loss": 0.0002, "num_tokens": 2213978.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 7423 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.25, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 20.25, "completions/mean_terminated_length": 20.25, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 137.4814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.07791054993867874, "kl": 0.02285378985106945, "learning_rate": 1.4316666666666668e-06, "loss": 0.0011, "num_tokens": 2214311.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7424 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 137.5, "frac_reward_zero_std": 1.0, "grad_norm": 0.013610085472464561, "kl": 0.0013027042150497437, "learning_rate": 1.4311111111111111e-06, "loss": 0.0001, "num_tokens": 2214555.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 7425 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 137.5185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.012988455593585968, "kl": 0.000342005499987863, "learning_rate": 1.4305555555555555e-06, "loss": 0.0, "num_tokens": 2214811.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 7426 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 137.53703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.05640748515725136, "kl": 0.009305561427026987, "learning_rate": 1.43e-06, "loss": 0.0004, "num_tokens": 2215095.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7427 }, { "clip_ratio/high_max": 0.017241379246115685, "clip_ratio/high_mean": 0.017241379246115685, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.017241379246115685, "completion_length": 18.75, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 18.75, "completions/mean_terminated_length": 18.75, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 137.55555555555554, "frac_reward_zero_std": 0.0, "grad_norm": 7.066039562225342, "kl": 0.04309556633234024, "learning_rate": 1.4294444444444446e-06, "loss": -0.0565, "num_tokens": 2215390.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 7428 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 18.0, "completions/mean_terminated_length": 18.0, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 137.57407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.040219515562057495, "kl": 0.012612413614988327, "learning_rate": 1.4288888888888892e-06, "loss": 0.0006, "num_tokens": 2215690.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7429 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.25, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.25, "completions/mean_terminated_length": 13.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 137.59259259259258, "frac_reward_zero_std": 1.0, "grad_norm": 0.014010502025485039, "kl": 0.002610691823065281, "learning_rate": 1.4283333333333336e-06, "loss": 0.0001, "num_tokens": 2215967.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7430 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.0, "completions/clipped_ratio": 0.0, "completions/max_length": 31.0, "completions/max_terminated_length": 31.0, "completions/mean_length": 22.0, "completions/mean_terminated_length": 22.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 137.61111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.1404745876789093, "kl": 0.08609815686941147, "learning_rate": 1.427777777777778e-06, "loss": 0.004, "num_tokens": 2216283.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7431 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.75, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.75, "completions/mean_terminated_length": 14.75, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 137.62962962962962, "frac_reward_zero_std": 1.0, "grad_norm": 0.0020263055339455605, "kl": 0.0013620652607642114, "learning_rate": 1.4272222222222223e-06, "loss": 0.0001, "num_tokens": 2216602.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7432 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 137.64814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.01959013007581234, "kl": 0.013965752441436052, "learning_rate": 1.4266666666666668e-06, "loss": 0.0007, "num_tokens": 2216862.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7433 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 137.66666666666666, "frac_reward_zero_std": 1.0, "grad_norm": 0.024753734469413757, "kl": 0.0006289705634117126, "learning_rate": 1.4261111111111112e-06, "loss": 0.0, "num_tokens": 2217074.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 7434 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.5, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 3.5, "completions/mean_terminated_length": 3.5, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 137.6851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.8022823333740234, "kl": 0.08507521450519562, "learning_rate": 1.4255555555555556e-06, "loss": 0.0061, "num_tokens": 2217284.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 7435 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.25, "completions/clipped_ratio": 0.0, "completions/max_length": 40.0, "completions/max_terminated_length": 40.0, "completions/mean_length": 32.25, "completions/mean_terminated_length": 32.25, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 137.7037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.0470011867582798, "kl": 0.03395429439842701, "learning_rate": 1.425e-06, "loss": 0.0018, "num_tokens": 2217637.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7436 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 10.5, "completions/mean_terminated_length": 10.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 137.72222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.03296223282814026, "kl": 0.006838690402219072, "learning_rate": 1.4244444444444447e-06, "loss": 0.0004, "num_tokens": 2217899.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 7437 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 137.74074074074073, "frac_reward_zero_std": 1.0, "grad_norm": 0.027619101107120514, "kl": 0.008388743037357926, "learning_rate": 1.423888888888889e-06, "loss": 0.0004, "num_tokens": 2218173.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7438 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 10.5, "completions/mean_terminated_length": 10.5, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 137.75925925925927, "frac_reward_zero_std": 1.0, "grad_norm": 0.013718240894377232, "kl": 0.00443073344649747, "learning_rate": 1.4233333333333336e-06, "loss": 0.0002, "num_tokens": 2218439.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7439 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 137.77777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.005545582622289658, "kl": 0.00032955408096313477, "learning_rate": 1.422777777777778e-06, "loss": 0.0, "num_tokens": 2218651.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 7440 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 42.25, "completions/clipped_ratio": 0.0, "completions/max_length": 46.0, "completions/max_terminated_length": 46.0, "completions/mean_length": 42.25, "completions/mean_terminated_length": 42.25, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 137.7962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.2024289071559906, "kl": 0.12086261808872223, "learning_rate": 1.4222222222222223e-06, "loss": 0.0061, "num_tokens": 2219088.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7441 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 137.8148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.008472100831568241, "kl": 0.007894372567534447, "learning_rate": 1.4216666666666667e-06, "loss": 0.0004, "num_tokens": 2219400.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 7442 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 47.25, "completions/clipped_ratio": 0.0, "completions/max_length": 51.0, "completions/max_terminated_length": 51.0, "completions/mean_length": 47.25, "completions/mean_terminated_length": 47.25, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 137.83333333333334, "frac_reward_zero_std": 1.0, "grad_norm": 0.024468598887324333, "kl": 0.04638317599892616, "learning_rate": 1.4211111111111112e-06, "loss": 0.0023, "num_tokens": 2219869.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 7443 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 137.85185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.00457670446485281, "kl": 0.010159626603126526, "learning_rate": 1.4205555555555556e-06, "loss": 0.0005, "num_tokens": 2220105.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 7444 }, { "clip_ratio/high_max": 0.016129031777381897, "clip_ratio/high_mean": 0.016129031777381897, "clip_ratio/low_mean": 0.016129031777381897, "clip_ratio/low_min": 0.016129031777381897, "clip_ratio/region_mean": 0.032258063554763794, "completion_length": 15.25, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.25, "completions/mean_terminated_length": 15.25, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 137.87037037037038, "frac_reward_zero_std": 0.0, "grad_norm": 3.6173574924468994, "kl": 0.03187138168141246, "learning_rate": 1.42e-06, "loss": 0.0113, "num_tokens": 2220406.0, "reward": 7.875, "reward_std": 0.25, "rewards/reward_combined/mean": 7.875, "rewards/reward_combined/std": 0.25, "step": 7445 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 27.0, "completions/clipped_ratio": 0.0, "completions/max_length": 30.0, "completions/max_terminated_length": 30.0, "completions/mean_length": 27.0, "completions/mean_terminated_length": 27.0, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 137.88888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.23390920460224152, "kl": 0.1487746238708496, "learning_rate": 1.4194444444444447e-06, "loss": 0.0074, "num_tokens": 2220730.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7446 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 137.90740740740742, "frac_reward_zero_std": 1.0, "grad_norm": 0.005184221547096968, "kl": 0.28168362379074097, "learning_rate": 1.418888888888889e-06, "loss": 0.0141, "num_tokens": 2221018.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7447 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 31.5, "completions/clipped_ratio": 0.0, "completions/max_length": 35.0, "completions/max_terminated_length": 35.0, "completions/mean_length": 31.5, "completions/mean_terminated_length": 31.5, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 137.92592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.07905510812997818, "kl": 0.16547538340091705, "learning_rate": 1.4183333333333334e-06, "loss": 0.0083, "num_tokens": 2221368.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7448 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 41.25, "completions/clipped_ratio": 0.0, "completions/max_length": 50.0, "completions/max_terminated_length": 50.0, "completions/mean_length": 41.25, "completions/mean_terminated_length": 41.25, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 137.94444444444446, "frac_reward_zero_std": 1.0, "grad_norm": 0.027460366487503052, "kl": 0.021414787508547306, "learning_rate": 1.417777777777778e-06, "loss": 0.0009, "num_tokens": 2221753.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7449 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 137.96296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.01265111193060875, "kl": 0.0008432579743384849, "learning_rate": 1.4172222222222224e-06, "loss": 0.0, "num_tokens": 2222017.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7450 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 137.9814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.0362669974565506, "kl": 0.035721711814403534, "learning_rate": 1.4166666666666667e-06, "loss": 0.0018, "num_tokens": 2222313.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7451 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.0, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 19.0, "completions/mean_terminated_length": 19.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 138.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.051252298057079315, "kl": 0.0033452032366767526, "learning_rate": 1.416111111111111e-06, "loss": 0.0002, "num_tokens": 2222629.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7452 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.75, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 12.75, "completions/mean_terminated_length": 12.75, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 138.0185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.024677859619259834, "kl": 0.0014426402049139142, "learning_rate": 1.4155555555555556e-06, "loss": 0.0001, "num_tokens": 2222892.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7453 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 26.5, "completions/clipped_ratio": 0.0, "completions/max_length": 31.0, "completions/max_terminated_length": 31.0, "completions/mean_length": 26.5, "completions/mean_terminated_length": 26.5, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 138.03703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.11425009369850159, "kl": 0.038581233471632004, "learning_rate": 1.415e-06, "loss": 0.002, "num_tokens": 2223222.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7454 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 138.05555555555554, "frac_reward_zero_std": 1.0, "grad_norm": 0.002211465500295162, "kl": 0.0017832752782851458, "learning_rate": 1.4144444444444446e-06, "loss": 0.0001, "num_tokens": 2223502.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7455 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 138.07407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.04939829185605049, "kl": 0.005479718558490276, "learning_rate": 1.4138888888888891e-06, "loss": 0.0003, "num_tokens": 2223804.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7456 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 138.09259259259258, "frac_reward_zero_std": 1.0, "grad_norm": 0.012840831652283669, "kl": 0.003041554009541869, "learning_rate": 1.4133333333333335e-06, "loss": 0.0002, "num_tokens": 2224088.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7457 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.25, "completions/clipped_ratio": 0.0, "completions/max_length": 29.0, "completions/max_terminated_length": 29.0, "completions/mean_length": 19.25, "completions/mean_terminated_length": 19.25, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 138.11111111111111, "frac_reward_zero_std": 0.0, "grad_norm": 3.6606783866882324, "kl": 0.2857631742954254, "learning_rate": 1.4127777777777779e-06, "loss": 0.162, "num_tokens": 2224405.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 7458 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.5, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 19.5, "completions/mean_terminated_length": 19.5, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 138.12962962962962, "frac_reward_zero_std": 0.0, "grad_norm": 2.8497872352600098, "kl": 0.17099870461970568, "learning_rate": 1.4122222222222224e-06, "loss": 0.0781, "num_tokens": 2224703.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 7459 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.75, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 13.75, "completions/mean_terminated_length": 13.75, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 138.14814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.04020872339606285, "kl": 0.011307629756629467, "learning_rate": 1.4116666666666668e-06, "loss": 0.0006, "num_tokens": 2224990.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7460 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 138.16666666666666, "frac_reward_zero_std": 0.0, "grad_norm": 4.4007182121276855, "kl": 0.17981489142403007, "learning_rate": 1.4111111111111111e-06, "loss": 0.0119, "num_tokens": 2225266.0, "reward": 7.75, "reward_std": 0.28867512941360474, "rewards/reward_combined/mean": 7.75, "rewards/reward_combined/std": 0.28867512941360474, "step": 7461 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 138.1851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.010607182048261166, "kl": 0.0015426799654960632, "learning_rate": 1.4105555555555555e-06, "loss": 0.0001, "num_tokens": 2225510.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 7462 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 138.2037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.1811276376247406, "kl": 0.027253206819295883, "learning_rate": 1.41e-06, "loss": 0.0014, "num_tokens": 2225804.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7463 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 138.22222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.005136036314070225, "kl": 0.2817169278860092, "learning_rate": 1.4094444444444446e-06, "loss": 0.0141, "num_tokens": 2226092.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7464 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.25, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 20.25, "completions/mean_terminated_length": 20.25, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 138.24074074074073, "frac_reward_zero_std": 1.0, "grad_norm": 0.05733634531497955, "kl": 0.009598626289516687, "learning_rate": 1.4088888888888892e-06, "loss": 0.0005, "num_tokens": 2226421.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 7465 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 138.25925925925927, "frac_reward_zero_std": 1.0, "grad_norm": 0.00015931802045088261, "kl": 2.2314488887786865e-05, "learning_rate": 1.4083333333333335e-06, "loss": 0.0, "num_tokens": 2226641.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 7466 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.5, "completions/clipped_ratio": 0.0, "completions/max_length": 41.0, "completions/max_terminated_length": 41.0, "completions/mean_length": 32.5, "completions/mean_terminated_length": 32.5, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 138.27777777777777, "frac_reward_zero_std": 0.0, "grad_norm": 2.5677051544189453, "kl": 0.08338806964457035, "learning_rate": 1.407777777777778e-06, "loss": 0.0334, "num_tokens": 2227023.0, "reward": 6.125, "reward_std": 3.4247870445251465, "rewards/reward_combined/mean": 6.125, "rewards/reward_combined/std": 3.4247870445251465, "step": 7467 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.75, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 14.75, "completions/mean_terminated_length": 14.75, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 138.2962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.2562639117240906, "kl": 0.06498626573011279, "learning_rate": 1.4072222222222223e-06, "loss": 0.0039, "num_tokens": 2227298.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7468 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 138.3148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.024420153349637985, "kl": 0.003484316752292216, "learning_rate": 1.4066666666666668e-06, "loss": 0.0002, "num_tokens": 2227594.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7469 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.5, "completions/clipped_ratio": 0.0, "completions/max_length": 29.0, "completions/max_terminated_length": 29.0, "completions/mean_length": 22.5, "completions/mean_terminated_length": 22.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 138.33333333333334, "frac_reward_zero_std": 0.0, "grad_norm": 3.1167197227478027, "kl": 0.10771218501031399, "learning_rate": 1.4061111111111112e-06, "loss": 0.107, "num_tokens": 2227932.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 7470 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.5, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 9.5, "completions/mean_terminated_length": 9.5, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 138.35185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.034297239035367966, "kl": 0.001857916300650686, "learning_rate": 1.4055555555555555e-06, "loss": 0.0001, "num_tokens": 2228190.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7471 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.25, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 14.25, "completions/mean_terminated_length": 14.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 138.37037037037038, "frac_reward_zero_std": 1.0, "grad_norm": 0.11646824330091476, "kl": 0.023543273098766804, "learning_rate": 1.4050000000000003e-06, "loss": 0.001, "num_tokens": 2228471.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7472 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 138.38888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.003898225026205182, "kl": 0.0001265406608581543, "learning_rate": 1.4044444444444447e-06, "loss": 0.0, "num_tokens": 2228683.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 7473 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 138.40740740740742, "frac_reward_zero_std": 1.0, "grad_norm": 0.26109588146209717, "kl": 0.1974712610244751, "learning_rate": 1.403888888888889e-06, "loss": 0.0099, "num_tokens": 2228967.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7474 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 138.42592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.0016469666734337807, "kl": 0.0013719398993998766, "learning_rate": 1.4033333333333336e-06, "loss": 0.0001, "num_tokens": 2229285.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7475 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.75, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 8.75, "completions/mean_terminated_length": 8.75, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 138.44444444444446, "frac_reward_zero_std": 0.0, "grad_norm": 7.454337120056152, "kl": 0.03223798982799053, "learning_rate": 1.402777777777778e-06, "loss": 0.3428, "num_tokens": 2229520.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 7476 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.25, "completions/clipped_ratio": 0.0, "completions/max_length": 30.0, "completions/max_terminated_length": 30.0, "completions/mean_length": 23.25, "completions/mean_terminated_length": 23.25, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 138.46296296296296, "frac_reward_zero_std": 0.0, "grad_norm": 3.779857635498047, "kl": 0.3870570370927453, "learning_rate": 1.4022222222222223e-06, "loss": 0.0954, "num_tokens": 2229865.0, "reward": 6.125, "reward_std": 3.75, "rewards/reward_combined/mean": 6.125, "rewards/reward_combined/std": 3.75, "step": 7477 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 138.4814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.0045860237441957, "kl": 0.010168075561523438, "learning_rate": 1.4016666666666667e-06, "loss": 0.0005, "num_tokens": 2230101.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 7478 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 138.5, "frac_reward_zero_std": 1.0, "grad_norm": 0.05864043906331062, "kl": 0.018396910279989243, "learning_rate": 1.4011111111111112e-06, "loss": 0.0009, "num_tokens": 2230391.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7479 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 31.5, "completions/clipped_ratio": 0.0, "completions/max_length": 36.0, "completions/max_terminated_length": 36.0, "completions/mean_length": 31.5, "completions/mean_terminated_length": 31.5, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 138.5185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.08885251730680466, "kl": 0.08409880101680756, "learning_rate": 1.4005555555555556e-06, "loss": 0.0041, "num_tokens": 2230745.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7480 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.5, "completions/clipped_ratio": 0.0, "completions/max_length": 52.0, "completions/max_terminated_length": 52.0, "completions/mean_length": 32.5, "completions/mean_terminated_length": 32.5, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 138.53703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.014051254838705063, "kl": 0.02042676880955696, "learning_rate": 1.4000000000000001e-06, "loss": 0.001, "num_tokens": 2231095.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7481 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 138.55555555555554, "frac_reward_zero_std": 1.0, "grad_norm": 0.008719266392290592, "kl": 0.008946296758949757, "learning_rate": 1.3994444444444447e-06, "loss": 0.0004, "num_tokens": 2231367.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7482 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.25, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 22.25, "completions/mean_terminated_length": 22.25, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 138.57407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.04080329090356827, "kl": 0.016475300304591656, "learning_rate": 1.398888888888889e-06, "loss": 0.0008, "num_tokens": 2231684.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7483 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.75, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 17.75, "completions/mean_terminated_length": 17.75, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 138.59259259259258, "frac_reward_zero_std": 1.0, "grad_norm": 0.035882193595170975, "kl": 0.0049165242817252874, "learning_rate": 1.3983333333333334e-06, "loss": 0.0002, "num_tokens": 2231995.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7484 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.0, "completions/clipped_ratio": 0.0, "completions/max_length": 29.0, "completions/max_terminated_length": 29.0, "completions/mean_length": 23.0, "completions/mean_terminated_length": 23.0, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 138.61111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.09013732522726059, "kl": 0.08571912348270416, "learning_rate": 1.397777777777778e-06, "loss": 0.0044, "num_tokens": 2232315.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7485 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.5, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 8.5, "completions/mean_terminated_length": 8.5, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 138.62962962962962, "frac_reward_zero_std": 1.0, "grad_norm": 0.05178727209568024, "kl": 0.003202712454367429, "learning_rate": 1.3972222222222224e-06, "loss": 0.0001, "num_tokens": 2232569.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 7486 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 138.64814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.019434306770563126, "kl": 0.004447643324965611, "learning_rate": 1.3966666666666667e-06, "loss": 0.0003, "num_tokens": 2232833.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7487 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 138.66666666666666, "frac_reward_zero_std": 1.0, "grad_norm": 0.02647077850997448, "kl": 0.03444714844226837, "learning_rate": 1.396111111111111e-06, "loss": 0.0019, "num_tokens": 2233105.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7488 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.0, "completions/clipped_ratio": 0.0, "completions/max_length": 27.0, "completions/max_terminated_length": 27.0, "completions/mean_length": 20.0, "completions/mean_terminated_length": 20.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 138.6851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.2823564410209656, "kl": 0.10107794217765331, "learning_rate": 1.3955555555555556e-06, "loss": 0.0054, "num_tokens": 2233405.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7489 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.25, "completions/clipped_ratio": 0.0, "completions/max_length": 25.0, "completions/max_terminated_length": 25.0, "completions/mean_length": 19.25, "completions/mean_terminated_length": 19.25, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 138.7037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.04120442643761635, "kl": 0.06585084646940231, "learning_rate": 1.3950000000000002e-06, "loss": 0.0033, "num_tokens": 2233702.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 7490 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.25, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 4.25, "completions/mean_terminated_length": 4.25, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 138.72222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.05902231112122536, "kl": 0.0029513821937143803, "learning_rate": 1.3944444444444446e-06, "loss": 0.0002, "num_tokens": 2233915.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 7491 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 41.25, "completions/clipped_ratio": 0.0, "completions/max_length": 73.0, "completions/max_terminated_length": 73.0, "completions/mean_length": 41.25, "completions/mean_terminated_length": 41.25, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 138.74074074074073, "frac_reward_zero_std": 1.0, "grad_norm": 0.4885314404964447, "kl": 0.19887521862983704, "learning_rate": 1.3938888888888891e-06, "loss": 0.0104, "num_tokens": 2234304.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7492 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 44.5, "completions/clipped_ratio": 0.0, "completions/max_length": 57.0, "completions/max_terminated_length": 57.0, "completions/mean_length": 44.5, "completions/mean_terminated_length": 44.5, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 138.75925925925927, "frac_reward_zero_std": 0.0, "grad_norm": 1.9282777309417725, "kl": 0.04860502481460571, "learning_rate": 1.3933333333333335e-06, "loss": -0.0501, "num_tokens": 2234762.0, "reward": 2.375, "reward_std": 1.25, "rewards/reward_combined/mean": 2.375, "rewards/reward_combined/std": 1.25, "step": 7493 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.5, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 17.5, "completions/mean_terminated_length": 17.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 138.77777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.022292565554380417, "kl": 0.16912420839071274, "learning_rate": 1.3927777777777778e-06, "loss": 0.0084, "num_tokens": 2235072.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7494 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.25, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 19.25, "completions/mean_terminated_length": 19.25, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 138.7962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.017533037811517715, "kl": 0.008538148365914822, "learning_rate": 1.3922222222222224e-06, "loss": 0.0004, "num_tokens": 2235377.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7495 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.75, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 6.75, "completions/mean_terminated_length": 6.75, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 138.8148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.06493455171585083, "kl": 0.012130178685765713, "learning_rate": 1.3916666666666668e-06, "loss": 0.0009, "num_tokens": 2235604.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 7496 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.25, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 9.25, "completions/mean_terminated_length": 9.25, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 138.83333333333334, "frac_reward_zero_std": 1.0, "grad_norm": 0.03787020593881607, "kl": 0.012614680919796228, "learning_rate": 1.3911111111111111e-06, "loss": 0.0006, "num_tokens": 2235865.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7497 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 138.85185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.01054032240062952, "kl": 0.008250085636973381, "learning_rate": 1.3905555555555555e-06, "loss": 0.0004, "num_tokens": 2236177.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 7498 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 38.25, "completions/clipped_ratio": 0.0, "completions/max_length": 45.0, "completions/max_terminated_length": 45.0, "completions/mean_length": 38.25, "completions/mean_terminated_length": 38.25, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 138.87037037037038, "frac_reward_zero_std": 1.0, "grad_norm": 0.012719101272523403, "kl": 0.08608448505401611, "learning_rate": 1.3900000000000002e-06, "loss": 0.0042, "num_tokens": 2236598.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7499 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 138.88888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.12739963829517365, "kl": 0.015005469787865877, "learning_rate": 1.3894444444444446e-06, "loss": 0.0009, "num_tokens": 2236842.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 7500 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 138.90740740740742, "frac_reward_zero_std": 1.0, "grad_norm": 0.004330046474933624, "kl": 0.00022050738334655762, "learning_rate": 1.3888888888888892e-06, "loss": 0.0, "num_tokens": 2237098.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 7501 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.25, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 22.25, "completions/mean_terminated_length": 22.25, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 138.92592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.23020349442958832, "kl": 0.05146658467128873, "learning_rate": 1.3883333333333335e-06, "loss": 0.0027, "num_tokens": 2237431.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7502 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.5, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 4.5, "completions/mean_terminated_length": 4.5, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 138.94444444444446, "frac_reward_zero_std": 1.0, "grad_norm": 0.10982006788253784, "kl": 0.006188948173075914, "learning_rate": 1.3877777777777779e-06, "loss": 0.0003, "num_tokens": 2237645.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 7503 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 29.25, "completions/clipped_ratio": 0.0, "completions/max_length": 36.0, "completions/max_terminated_length": 36.0, "completions/mean_length": 29.25, "completions/mean_terminated_length": 29.25, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 138.96296296296296, "frac_reward_zero_std": 0.0, "grad_norm": 2.5178356170654297, "kl": 0.21324704587459564, "learning_rate": 1.3872222222222222e-06, "loss": 0.0871, "num_tokens": 2237998.0, "reward": 5.375, "reward_std": 2.462214469909668, "rewards/reward_combined/mean": 5.375, "rewards/reward_combined/std": 2.462214469909668, "step": 7504 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.75, "completions/clipped_ratio": 0.0, "completions/max_length": 42.0, "completions/max_terminated_length": 42.0, "completions/mean_length": 32.75, "completions/mean_terminated_length": 32.75, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 138.9814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.8275254964828491, "kl": 0.2483626753091812, "learning_rate": 1.3866666666666668e-06, "loss": 0.0132, "num_tokens": 2238345.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7505 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 139.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.08403396606445312, "kl": 0.00781862810254097, "learning_rate": 1.3861111111111112e-06, "loss": 0.0004, "num_tokens": 2238605.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 7506 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.25, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 17.25, "completions/mean_terminated_length": 17.25, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 139.0185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.11015898734331131, "kl": 0.01666840660618618, "learning_rate": 1.3855555555555555e-06, "loss": 0.0008, "num_tokens": 2238914.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7507 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 139.03703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.008055220358073711, "kl": 0.009082799777388573, "learning_rate": 1.3850000000000003e-06, "loss": 0.0005, "num_tokens": 2239186.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7508 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 139.05555555555554, "frac_reward_zero_std": 1.0, "grad_norm": 0.1272338330745697, "kl": 0.009518206119537354, "learning_rate": 1.3844444444444446e-06, "loss": 0.0005, "num_tokens": 2239398.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 7509 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.0, "completions/clipped_ratio": 0.0, "completions/max_length": 25.0, "completions/max_terminated_length": 25.0, "completions/mean_length": 22.0, "completions/mean_terminated_length": 22.0, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 139.07407407407408, "frac_reward_zero_std": 0.0, "grad_norm": 3.6895523071289062, "kl": 0.12866017827764153, "learning_rate": 1.383888888888889e-06, "loss": -0.0167, "num_tokens": 2239734.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 7510 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 139.09259259259258, "frac_reward_zero_std": 1.0, "grad_norm": 0.0167455542832613, "kl": 0.0009982479678001255, "learning_rate": 1.3833333333333336e-06, "loss": 0.0, "num_tokens": 2240002.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7511 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.25, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 19.25, "completions/mean_terminated_length": 19.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 139.11111111111111, "frac_reward_zero_std": 0.0, "grad_norm": 5.514046669006348, "kl": 0.2491222694516182, "learning_rate": 1.382777777777778e-06, "loss": 0.0855, "num_tokens": 2240319.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 7512 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 139.12962962962962, "frac_reward_zero_std": 0.0, "grad_norm": 4.543398857116699, "kl": 0.08181217685341835, "learning_rate": 1.3822222222222223e-06, "loss": 0.054, "num_tokens": 2240611.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 7513 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.25, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 23.25, "completions/mean_terminated_length": 23.25, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 139.14814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.06797277182340622, "kl": 0.017128555569797754, "learning_rate": 1.3816666666666666e-06, "loss": 0.0009, "num_tokens": 2240932.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7514 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 139.16666666666666, "frac_reward_zero_std": 1.0, "grad_norm": 0.06812111288309097, "kl": 0.010299778543412685, "learning_rate": 1.3811111111111112e-06, "loss": 0.0005, "num_tokens": 2241214.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7515 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 139.1851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.003477399470284581, "kl": 0.00024762749671936035, "learning_rate": 1.3805555555555556e-06, "loss": 0.0, "num_tokens": 2241470.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 7516 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 139.2037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.016205184161663055, "kl": 0.003455040365224704, "learning_rate": 1.3800000000000001e-06, "loss": 0.0002, "num_tokens": 2241738.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7517 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.25, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.25, "completions/mean_terminated_length": 14.25, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 139.22222222222223, "frac_reward_zero_std": 0.0, "grad_norm": 3.4919955730438232, "kl": 0.3713112473487854, "learning_rate": 1.3794444444444447e-06, "loss": -0.0402, "num_tokens": 2242019.0, "reward": 7.875, "reward_std": 0.25, "rewards/reward_combined/mean": 7.875, "rewards/reward_combined/std": 0.25, "step": 7518 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.5, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 20.5, "completions/mean_terminated_length": 20.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 139.24074074074073, "frac_reward_zero_std": 1.0, "grad_norm": 0.0427195243537426, "kl": 0.009572797920554876, "learning_rate": 1.378888888888889e-06, "loss": 0.0005, "num_tokens": 2242349.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7519 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.25, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 10.25, "completions/mean_terminated_length": 10.25, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 139.25925925925927, "frac_reward_zero_std": 1.0, "grad_norm": 0.07044818997383118, "kl": 0.0047075479524210095, "learning_rate": 1.3783333333333334e-06, "loss": 0.0002, "num_tokens": 2242610.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 7520 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 139.27777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.018596110865473747, "kl": 0.0031680805259384215, "learning_rate": 1.377777777777778e-06, "loss": 0.0002, "num_tokens": 2242898.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7521 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.25, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 16.25, "completions/mean_terminated_length": 16.25, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 139.2962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.06526416540145874, "kl": 0.014740831218659878, "learning_rate": 1.3772222222222223e-06, "loss": 0.0008, "num_tokens": 2243179.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7522 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.75, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 6.75, "completions/mean_terminated_length": 6.75, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 139.3148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.07110103219747543, "kl": 0.025996368378400803, "learning_rate": 1.3766666666666667e-06, "loss": 0.0012, "num_tokens": 2243406.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 7523 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.75, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.75, "completions/mean_terminated_length": 15.75, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 139.33333333333334, "frac_reward_zero_std": 1.0, "grad_norm": 0.011677158996462822, "kl": 0.0014429744333028793, "learning_rate": 1.376111111111111e-06, "loss": 0.0001, "num_tokens": 2243729.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7524 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 139.35185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.030097300186753273, "kl": 0.015768670476973057, "learning_rate": 1.3755555555555556e-06, "loss": 0.0008, "num_tokens": 2243997.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7525 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 139.37037037037038, "frac_reward_zero_std": 1.0, "grad_norm": 0.00383553234860301, "kl": 8.783489465713501e-05, "learning_rate": 1.3750000000000002e-06, "loss": 0.0, "num_tokens": 2244209.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 7526 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 31.25, "completions/clipped_ratio": 0.0, "completions/max_length": 48.0, "completions/max_terminated_length": 48.0, "completions/mean_length": 31.25, "completions/mean_terminated_length": 31.25, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 139.38888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.025701744481921196, "kl": 0.023229573387652636, "learning_rate": 1.3744444444444445e-06, "loss": 0.0011, "num_tokens": 2244554.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7527 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 34.5, "completions/clipped_ratio": 0.0, "completions/max_length": 51.0, "completions/max_terminated_length": 51.0, "completions/mean_length": 34.5, "completions/mean_terminated_length": 34.5, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 139.40740740740742, "frac_reward_zero_std": 0.0, "grad_norm": 3.345607280731201, "kl": 0.1543029546737671, "learning_rate": 1.373888888888889e-06, "loss": 0.1752, "num_tokens": 2244928.0, "reward": 6.5, "reward_std": 2.0, "rewards/reward_combined/mean": 6.5, "rewards/reward_combined/std": 2.0, "step": 7528 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.0, "completions/clipped_ratio": 0.0, "completions/max_length": 33.0, "completions/max_terminated_length": 33.0, "completions/mean_length": 24.0, "completions/mean_terminated_length": 24.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 139.42592592592592, "frac_reward_zero_std": 0.0, "grad_norm": 3.99615478515625, "kl": 0.1048823818564415, "learning_rate": 1.3733333333333335e-06, "loss": 0.0927, "num_tokens": 2245244.0, "reward": 3.875, "reward_std": 0.25, "rewards/reward_combined/mean": 3.875, "rewards/reward_combined/std": 0.25, "step": 7529 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 9.5, "completions/mean_terminated_length": 9.5, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 139.44444444444446, "frac_reward_zero_std": 1.0, "grad_norm": 0.02169402875006199, "kl": 0.0012145958608016372, "learning_rate": 1.3727777777777778e-06, "loss": 0.0001, "num_tokens": 2245502.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7530 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.015384615398943424, "clip_ratio/low_min": 0.015384615398943424, "clip_ratio/region_mean": 0.015384615398943424, "completion_length": 88.0, "completions/clipped_ratio": 0.25, "completions/max_length": 256.0, "completions/max_terminated_length": 36.0, "completions/mean_length": 88.0, "completions/mean_terminated_length": 32.0, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 139.46296296296296, "frac_reward_zero_std": 0.0, "grad_norm": 2.9352965354919434, "kl": 0.3103950172662735, "learning_rate": 1.3722222222222224e-06, "loss": 0.4571, "num_tokens": 2246078.0, "reward": 4.175000190734863, "reward_std": 5.11101770401001, "rewards/reward_combined/mean": 4.175000190734863, "rewards/reward_combined/std": 5.11101770401001, "step": 7531 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 33.75, "completions/clipped_ratio": 0.0, "completions/max_length": 37.0, "completions/max_terminated_length": 37.0, "completions/mean_length": 33.75, "completions/mean_terminated_length": 33.75, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 139.4814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.10977271944284439, "kl": 0.08339604362845421, "learning_rate": 1.3716666666666667e-06, "loss": 0.004, "num_tokens": 2246441.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7532 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 139.5, "frac_reward_zero_std": 1.0, "grad_norm": 0.09294720739126205, "kl": 0.03325286915060133, "learning_rate": 1.371111111111111e-06, "loss": 0.0018, "num_tokens": 2246739.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7533 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 47.0, "completions/clipped_ratio": 0.0, "completions/max_length": 51.0, "completions/max_terminated_length": 51.0, "completions/mean_length": 47.0, "completions/mean_terminated_length": 47.0, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 139.5185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.031792331486940384, "kl": 0.04490877315402031, "learning_rate": 1.3705555555555559e-06, "loss": 0.0022, "num_tokens": 2247207.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 7534 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 37.5, "completions/clipped_ratio": 0.0, "completions/max_length": 95.0, "completions/max_terminated_length": 95.0, "completions/mean_length": 37.5, "completions/mean_terminated_length": 37.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 139.53703703703704, "frac_reward_zero_std": 0.0, "grad_norm": 2.2712104320526123, "kl": 0.10089670121669769, "learning_rate": 1.3700000000000002e-06, "loss": 0.3384, "num_tokens": 2247585.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 7535 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 139.55555555555554, "frac_reward_zero_std": 1.0, "grad_norm": 0.1445525586605072, "kl": 0.021616362035274506, "learning_rate": 1.3694444444444446e-06, "loss": 0.0011, "num_tokens": 2247877.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7536 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.0, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 21.0, "completions/mean_terminated_length": 21.0, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 139.57407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.07643215358257294, "kl": 0.015816733706742525, "learning_rate": 1.3688888888888891e-06, "loss": 0.0008, "num_tokens": 2248205.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7537 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.75, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 18.75, "completions/mean_terminated_length": 18.75, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 139.59259259259258, "frac_reward_zero_std": 1.0, "grad_norm": 0.06252441555261612, "kl": 0.02019091136753559, "learning_rate": 1.3683333333333335e-06, "loss": 0.001, "num_tokens": 2248500.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7538 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.25, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.25, "completions/mean_terminated_length": 13.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 139.61111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.01373893953859806, "kl": 0.002576087135821581, "learning_rate": 1.3677777777777779e-06, "loss": 0.0001, "num_tokens": 2248777.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7539 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 139.62962962962962, "frac_reward_zero_std": 1.0, "grad_norm": 0.08041368424892426, "kl": 0.021459692157804966, "learning_rate": 1.3672222222222222e-06, "loss": 0.0013, "num_tokens": 2249049.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7540 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.0, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 17.0, "completions/mean_terminated_length": 17.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 139.64814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.6206173896789551, "kl": 0.24753564596176147, "learning_rate": 1.3666666666666668e-06, "loss": 0.0127, "num_tokens": 2249337.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7541 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.5, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 2.5, "completions/mean_terminated_length": 2.5, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 139.66666666666666, "frac_reward_zero_std": 1.0, "grad_norm": 0.036406341940164566, "kl": 0.0027797818183898926, "learning_rate": 1.3661111111111111e-06, "loss": 0.0001, "num_tokens": 2249543.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 7542 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 139.6851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.07180456072092056, "kl": 0.28600814938545227, "learning_rate": 1.3655555555555557e-06, "loss": 0.0143, "num_tokens": 2249831.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7543 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.75, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 17.75, "completions/mean_terminated_length": 17.75, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 139.7037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.04761592298746109, "kl": 0.014122267719358206, "learning_rate": 1.3650000000000003e-06, "loss": 0.0007, "num_tokens": 2250130.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7544 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.5, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.5, "completions/mean_terminated_length": 6.5, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 139.72222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.0075411489233374596, "kl": 0.0005974875821266323, "learning_rate": 1.3644444444444446e-06, "loss": 0.0, "num_tokens": 2250364.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 7545 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 139.74074074074073, "frac_reward_zero_std": 1.0, "grad_norm": 0.012809357605874538, "kl": 0.002316651400178671, "learning_rate": 1.363888888888889e-06, "loss": 0.0001, "num_tokens": 2250648.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7546 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 28.75, "completions/clipped_ratio": 0.0, "completions/max_length": 30.0, "completions/max_terminated_length": 30.0, "completions/mean_length": 28.75, "completions/mean_terminated_length": 28.75, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 139.75925925925927, "frac_reward_zero_std": 1.0, "grad_norm": 0.051851630210876465, "kl": 0.13336174190044403, "learning_rate": 1.3633333333333336e-06, "loss": 0.0067, "num_tokens": 2250979.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7547 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.75, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 4.75, "completions/mean_terminated_length": 4.75, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 139.77777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.017130928114056587, "kl": 0.0016154751065187156, "learning_rate": 1.362777777777778e-06, "loss": 0.0001, "num_tokens": 2251198.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 7548 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 139.7962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.018943678587675095, "kl": 0.003291770815849304, "learning_rate": 1.3622222222222223e-06, "loss": 0.0002, "num_tokens": 2251458.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 7549 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 40.25, "completions/clipped_ratio": 0.0, "completions/max_length": 50.0, "completions/max_terminated_length": 50.0, "completions/mean_length": 40.25, "completions/mean_terminated_length": 40.25, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 139.8148148148148, "frac_reward_zero_std": 0.0, "grad_norm": 2.3303847312927246, "kl": 0.16387420147657394, "learning_rate": 1.3616666666666666e-06, "loss": -0.0755, "num_tokens": 2251871.0, "reward": 5.125, "reward_std": 3.4731109142303467, "rewards/reward_combined/mean": 5.125, "rewards/reward_combined/std": 3.4731109142303467, "step": 7550 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 139.83333333333334, "frac_reward_zero_std": 1.0, "grad_norm": 0.004696832969784737, "kl": 0.010140344500541687, "learning_rate": 1.3611111111111112e-06, "loss": 0.0005, "num_tokens": 2252107.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 7551 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.25, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 15.25, "completions/mean_terminated_length": 15.25, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 139.85185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.04635066166520119, "kl": 0.23789452016353607, "learning_rate": 1.3605555555555558e-06, "loss": 0.0118, "num_tokens": 2252408.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7552 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 43.0, "completions/clipped_ratio": 0.0, "completions/max_length": 45.0, "completions/max_terminated_length": 45.0, "completions/mean_length": 43.0, "completions/mean_terminated_length": 43.0, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 139.87037037037038, "frac_reward_zero_std": 1.0, "grad_norm": 0.1784685254096985, "kl": 0.09765370562672615, "learning_rate": 1.3600000000000001e-06, "loss": 0.0049, "num_tokens": 2252848.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7553 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 17.0, "completions/mean_terminated_length": 17.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 139.88888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.04068175330758095, "kl": 0.0050275607500225306, "learning_rate": 1.3594444444444447e-06, "loss": 0.0003, "num_tokens": 2253156.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7554 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.75, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 9.75, "completions/mean_terminated_length": 9.75, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 139.90740740740742, "frac_reward_zero_std": 1.0, "grad_norm": 0.13575343787670135, "kl": 0.02728920429944992, "learning_rate": 1.358888888888889e-06, "loss": 0.0015, "num_tokens": 2253407.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 7555 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 139.92592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.04303482174873352, "kl": 0.009320146404206753, "learning_rate": 1.3583333333333334e-06, "loss": 0.0005, "num_tokens": 2253667.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7556 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.25, "completions/clipped_ratio": 0.0, "completions/max_length": 25.0, "completions/max_terminated_length": 25.0, "completions/mean_length": 18.25, "completions/mean_terminated_length": 18.25, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 139.94444444444446, "frac_reward_zero_std": 1.0, "grad_norm": 0.2199338674545288, "kl": 0.025304872542619705, "learning_rate": 1.357777777777778e-06, "loss": 0.0015, "num_tokens": 2253988.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 7557 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 139.96296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.00015988381346687675, "kl": 2.1167099475860596e-05, "learning_rate": 1.3572222222222223e-06, "loss": 0.0, "num_tokens": 2254208.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 7558 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.5, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 20.5, "completions/mean_terminated_length": 20.5, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 139.9814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.07329762727022171, "kl": 0.01619691774249077, "learning_rate": 1.3566666666666667e-06, "loss": 0.0008, "num_tokens": 2254542.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7559 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 34.0, "completions/clipped_ratio": 0.0, "completions/max_length": 40.0, "completions/max_terminated_length": 40.0, "completions/mean_length": 34.0, "completions/mean_terminated_length": 34.0, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 140.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.0458751916885376, "kl": 0.030460692942142487, "learning_rate": 1.356111111111111e-06, "loss": 0.0015, "num_tokens": 2254902.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7560 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 140.0185185185185, "frac_reward_zero_std": 0.0, "grad_norm": 4.516070365905762, "kl": 0.012306715128943324, "learning_rate": 1.3555555555555558e-06, "loss": 0.0663, "num_tokens": 2255202.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 7561 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 42.0, "completions/clipped_ratio": 0.0, "completions/max_length": 45.0, "completions/max_terminated_length": 45.0, "completions/mean_length": 42.0, "completions/mean_terminated_length": 42.0, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 140.03703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.010489845648407936, "kl": 0.0759279727935791, "learning_rate": 1.3550000000000002e-06, "loss": 0.0038, "num_tokens": 2255638.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7562 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 34.0, "completions/clipped_ratio": 0.0, "completions/max_length": 35.0, "completions/max_terminated_length": 35.0, "completions/mean_length": 34.0, "completions/mean_terminated_length": 34.0, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 140.05555555555554, "frac_reward_zero_std": 1.0, "grad_norm": 0.0885261669754982, "kl": 0.09317052736878395, "learning_rate": 1.3544444444444445e-06, "loss": 0.0047, "num_tokens": 2256026.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7563 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.75, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 12.75, "completions/mean_terminated_length": 12.75, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 140.07407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.009798653423786163, "kl": 0.0007139467052184045, "learning_rate": 1.353888888888889e-06, "loss": 0.0, "num_tokens": 2256289.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7564 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.25, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 9.25, "completions/mean_terminated_length": 9.25, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 140.09259259259258, "frac_reward_zero_std": 0.0, "grad_norm": 2.6995527744293213, "kl": 1.3078058594837785, "learning_rate": 1.3533333333333334e-06, "loss": 0.0945, "num_tokens": 2256550.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 7565 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 17.0, "completions/mean_terminated_length": 17.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 140.11111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.009204644709825516, "kl": 0.17338433116674423, "learning_rate": 1.3527777777777778e-06, "loss": 0.0087, "num_tokens": 2256858.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7566 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 140.12962962962962, "frac_reward_zero_std": 1.0, "grad_norm": 0.13389845192432404, "kl": 0.4480697810649872, "learning_rate": 1.3522222222222224e-06, "loss": 0.0224, "num_tokens": 2257142.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7567 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 30.75, "completions/clipped_ratio": 0.0, "completions/max_length": 36.0, "completions/max_terminated_length": 36.0, "completions/mean_length": 30.75, "completions/mean_terminated_length": 30.75, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 140.14814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 3.8735153675079346, "kl": 0.5935864746570587, "learning_rate": 1.3516666666666667e-06, "loss": 0.0317, "num_tokens": 2257501.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7568 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.0, "completions/clipped_ratio": 0.0, "completions/max_length": 29.0, "completions/max_terminated_length": 29.0, "completions/mean_length": 22.0, "completions/mean_terminated_length": 22.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 140.16666666666666, "frac_reward_zero_std": 1.0, "grad_norm": 0.09940340369939804, "kl": 0.028568672947585583, "learning_rate": 1.351111111111111e-06, "loss": 0.0015, "num_tokens": 2257837.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 7569 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 12.5, "completions/mean_terminated_length": 12.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 140.1851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.045145433396101, "kl": 0.005109386285766959, "learning_rate": 1.3505555555555558e-06, "loss": 0.0002, "num_tokens": 2258119.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7570 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.5, "completions/clipped_ratio": 0.0, "completions/max_length": 30.0, "completions/max_terminated_length": 30.0, "completions/mean_length": 21.5, "completions/mean_terminated_length": 21.5, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 140.2037037037037, "frac_reward_zero_std": 0.0, "grad_norm": 1.9789059162139893, "kl": 0.4086389932781458, "learning_rate": 1.3500000000000002e-06, "loss": 0.0417, "num_tokens": 2258433.0, "reward": 3.75, "reward_std": 5.057997226715088, "rewards/reward_combined/mean": 3.75, "rewards/reward_combined/std": 5.057997226715088, "step": 7571 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 140.22222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.014779790304601192, "kl": 0.0031699873507022858, "learning_rate": 1.3494444444444446e-06, "loss": 0.0002, "num_tokens": 2258693.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 7572 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 140.24074074074073, "frac_reward_zero_std": 1.0, "grad_norm": 0.010579735971987247, "kl": 0.0017144754528999329, "learning_rate": 1.3488888888888891e-06, "loss": 0.0001, "num_tokens": 2258937.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 7573 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 35.0, "completions/clipped_ratio": 0.0, "completions/max_length": 50.0, "completions/max_terminated_length": 50.0, "completions/mean_length": 35.0, "completions/mean_terminated_length": 35.0, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 140.25925925925927, "frac_reward_zero_std": 1.0, "grad_norm": 0.1608457714319229, "kl": 0.0669605452567339, "learning_rate": 1.3483333333333335e-06, "loss": 0.0032, "num_tokens": 2259301.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7574 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 140.27777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.055083490908145905, "kl": 0.010239110881229863, "learning_rate": 1.3477777777777778e-06, "loss": 0.0005, "num_tokens": 2259567.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7575 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.5, "completions/clipped_ratio": 0.0, "completions/max_length": 25.0, "completions/max_terminated_length": 25.0, "completions/mean_length": 20.5, "completions/mean_terminated_length": 20.5, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 140.2962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.03896003216505051, "kl": 0.20271407812833786, "learning_rate": 1.3472222222222222e-06, "loss": 0.0095, "num_tokens": 2259897.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7576 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.5, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 20.5, "completions/mean_terminated_length": 20.5, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 140.3148148148148, "frac_reward_zero_std": 0.0, "grad_norm": 3.757214307785034, "kl": 0.03744233679026365, "learning_rate": 1.3466666666666668e-06, "loss": 0.0033, "num_tokens": 2260231.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 7577 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.5, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 22.5, "completions/mean_terminated_length": 22.5, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 140.33333333333334, "frac_reward_zero_std": 1.0, "grad_norm": 0.10195822268724442, "kl": 0.022200881503522396, "learning_rate": 1.3461111111111111e-06, "loss": 0.0011, "num_tokens": 2260565.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7578 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.25, "completions/clipped_ratio": 0.0, "completions/max_length": 29.0, "completions/max_terminated_length": 29.0, "completions/mean_length": 24.25, "completions/mean_terminated_length": 24.25, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 140.35185185185185, "frac_reward_zero_std": 0.0, "grad_norm": 1.9607194662094116, "kl": 0.12560562044382095, "learning_rate": 1.3455555555555557e-06, "loss": 0.0898, "num_tokens": 2260878.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 7579 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 140.37037037037038, "frac_reward_zero_std": 1.0, "grad_norm": 0.005008919630199671, "kl": 0.010096810758113861, "learning_rate": 1.3450000000000003e-06, "loss": 0.0005, "num_tokens": 2261114.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 7580 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 74.75, "completions/clipped_ratio": 0.25, "completions/max_length": 256.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 74.75, "completions/mean_terminated_length": 14.333333969116211, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 140.38888888888889, "frac_reward_zero_std": 0.0, "grad_norm": 1.6526912450790405, "kl": 0.021088379435241222, "learning_rate": 1.3444444444444446e-06, "loss": 0.4518, "num_tokens": 2261645.0, "reward": 4.925000190734863, "reward_std": 5.150000095367432, "rewards/reward_combined/mean": 4.925000190734863, "rewards/reward_combined/std": 5.150000095367432, "step": 7581 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 44.75, "completions/clipped_ratio": 0.0, "completions/max_length": 51.0, "completions/max_terminated_length": 51.0, "completions/mean_length": 44.75, "completions/mean_terminated_length": 44.75, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "epoch": 140.40740740740742, "frac_reward_zero_std": 1.0, "grad_norm": 1.9284673929214478, "kl": 0.3720803838223219, "learning_rate": 1.343888888888889e-06, "loss": 0.0182, "num_tokens": 2262104.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 7582 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 140.42592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.07187484949827194, "kl": 0.002595454454421997, "learning_rate": 1.3433333333333335e-06, "loss": 0.0002, "num_tokens": 2262320.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 7583 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.25, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.25, "completions/mean_terminated_length": 13.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 140.44444444444446, "frac_reward_zero_std": 1.0, "grad_norm": 0.01378275640308857, "kl": 0.0026229667710140347, "learning_rate": 1.3427777777777779e-06, "loss": 0.0001, "num_tokens": 2262597.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7584 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.75, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.75, "completions/mean_terminated_length": 14.75, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 140.46296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.003327698679640889, "kl": 0.0014006216661073267, "learning_rate": 1.3422222222222222e-06, "loss": 0.0001, "num_tokens": 2262916.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7585 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.75, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.75, "completions/mean_terminated_length": 14.75, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 140.4814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.04210834205150604, "kl": 0.02097201207652688, "learning_rate": 1.3416666666666666e-06, "loss": 0.001, "num_tokens": 2263207.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7586 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 140.5, "frac_reward_zero_std": 1.0, "grad_norm": 0.009718677029013634, "kl": 0.21399284899234772, "learning_rate": 1.3411111111111112e-06, "loss": 0.0107, "num_tokens": 2263511.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7587 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.25, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.25, "completions/mean_terminated_length": 11.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 140.5185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.010862105526030064, "kl": 0.006369492504745722, "learning_rate": 1.3405555555555557e-06, "loss": 0.0003, "num_tokens": 2263784.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7588 }, { "clip_ratio/high_max": 0.013513513840734959, "clip_ratio/high_mean": 0.013513513840734959, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.013513513840734959, "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 18.0, "completions/mean_terminated_length": 18.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 140.53703703703704, "frac_reward_zero_std": 0.0, "grad_norm": 5.326359748840332, "kl": 0.24651286751031876, "learning_rate": 1.34e-06, "loss": 0.0543, "num_tokens": 2264076.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 7589 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 30.75, "completions/clipped_ratio": 0.0, "completions/max_length": 39.0, "completions/max_terminated_length": 39.0, "completions/mean_length": 30.75, "completions/mean_terminated_length": 30.75, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 140.55555555555554, "frac_reward_zero_std": 0.0, "grad_norm": 2.8263092041015625, "kl": 0.11935823783278465, "learning_rate": 1.3394444444444447e-06, "loss": -0.0011, "num_tokens": 2264423.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 7590 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 140.57407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.0034813808742910624, "kl": 0.2820576876401901, "learning_rate": 1.338888888888889e-06, "loss": 0.0141, "num_tokens": 2264711.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7591 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 140.59259259259258, "frac_reward_zero_std": 1.0, "grad_norm": 0.015458520501852036, "kl": 0.003574945032596588, "learning_rate": 1.3383333333333334e-06, "loss": 0.0002, "num_tokens": 2264919.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 7592 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 140.61111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.0887407585978508, "kl": 0.020685535855591297, "learning_rate": 1.337777777777778e-06, "loss": 0.0011, "num_tokens": 2265197.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7593 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 36.5, "completions/clipped_ratio": 0.0, "completions/max_length": 48.0, "completions/max_terminated_length": 48.0, "completions/mean_length": 36.5, "completions/mean_terminated_length": 36.5, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 140.62962962962962, "frac_reward_zero_std": 1.0, "grad_norm": 0.31109586358070374, "kl": 0.0622264351695776, "learning_rate": 1.3372222222222223e-06, "loss": 0.0027, "num_tokens": 2265563.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7594 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 140.64814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.006151859182864428, "kl": 0.001224622130393982, "learning_rate": 1.3366666666666666e-06, "loss": 0.0001, "num_tokens": 2265823.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 7595 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 140.66666666666666, "frac_reward_zero_std": 1.0, "grad_norm": 0.014622705057263374, "kl": 0.02802111953496933, "learning_rate": 1.3361111111111114e-06, "loss": 0.0014, "num_tokens": 2266039.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 7596 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.001879699295386672, "clip_ratio/low_min": 0.001879699295386672, "clip_ratio/region_mean": 0.001879699295386672, "completion_length": 77.0, "completions/clipped_ratio": 0.25, "completions/max_length": 256.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 77.0, "completions/mean_terminated_length": 17.33333396911621, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 140.6851851851852, "frac_reward_zero_std": 0.0, "grad_norm": 2.592824935913086, "kl": 0.042215841356664896, "learning_rate": 1.3355555555555558e-06, "loss": 0.4671, "num_tokens": 2266567.0, "reward": 6.800000190734863, "reward_std": 1.4000000953674316, "rewards/reward_combined/mean": 6.800000190734863, "rewards/reward_combined/std": 1.4000000953674316, "step": 7597 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 140.7037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.009371922351419926, "kl": 0.003583682468160987, "learning_rate": 1.3350000000000001e-06, "loss": 0.0002, "num_tokens": 2266855.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7598 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.25, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 4.25, "completions/mean_terminated_length": 4.25, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 140.72222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.052663423120975494, "kl": 0.0017877995851449668, "learning_rate": 1.3344444444444447e-06, "loss": 0.0001, "num_tokens": 2267068.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 7599 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 35.5, "completions/clipped_ratio": 0.0, "completions/max_length": 39.0, "completions/max_terminated_length": 39.0, "completions/mean_length": 35.5, "completions/mean_terminated_length": 35.5, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 140.74074074074073, "frac_reward_zero_std": 1.0, "grad_norm": 0.017687121406197548, "kl": 0.06544274836778641, "learning_rate": 1.333888888888889e-06, "loss": 0.0032, "num_tokens": 2267438.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7600 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.0, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 23.0, "completions/mean_terminated_length": 23.0, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 140.75925925925927, "frac_reward_zero_std": 1.0, "grad_norm": 0.036772798746824265, "kl": 0.008643870707601309, "learning_rate": 1.3333333333333334e-06, "loss": 0.0004, "num_tokens": 2267758.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7601 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 140.77777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.0032836757600307465, "kl": 0.00012795626753359102, "learning_rate": 1.3327777777777778e-06, "loss": 0.0, "num_tokens": 2268014.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 7602 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.75, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 18.75, "completions/mean_terminated_length": 18.75, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 140.7962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.2530781328678131, "kl": 0.04517043847590685, "learning_rate": 1.3322222222222223e-06, "loss": 0.0024, "num_tokens": 2268309.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7603 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 140.8148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.12595945596694946, "kl": 0.035881513729691505, "learning_rate": 1.3316666666666667e-06, "loss": 0.0019, "num_tokens": 2268613.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7604 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 140.83333333333334, "frac_reward_zero_std": 1.0, "grad_norm": 0.00015628262190148234, "kl": 2.0273029804229736e-05, "learning_rate": 1.3311111111111113e-06, "loss": 0.0, "num_tokens": 2268833.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 7605 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.5, "completions/clipped_ratio": 0.0, "completions/max_length": 25.0, "completions/max_terminated_length": 25.0, "completions/mean_length": 22.5, "completions/mean_terminated_length": 22.5, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 140.85185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.2141455113887787, "kl": 0.06731974519789219, "learning_rate": 1.3305555555555558e-06, "loss": 0.0034, "num_tokens": 2269143.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 7606 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.75, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 17.75, "completions/mean_terminated_length": 17.75, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 140.87037037037038, "frac_reward_zero_std": 0.0, "grad_norm": 1.2902652025222778, "kl": 0.0826382587547414, "learning_rate": 1.3300000000000002e-06, "loss": -0.0098, "num_tokens": 2269454.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 7607 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.75, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 15.75, "completions/mean_terminated_length": 15.75, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 140.88888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.04906056076288223, "kl": 0.013733450323343277, "learning_rate": 1.3294444444444445e-06, "loss": 0.0007, "num_tokens": 2269733.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7608 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.75, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 4.75, "completions/mean_terminated_length": 4.75, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 140.90740740740742, "frac_reward_zero_std": 1.0, "grad_norm": 0.01764458790421486, "kl": 0.0014863028627587482, "learning_rate": 1.3288888888888891e-06, "loss": 0.0001, "num_tokens": 2269952.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 7609 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 140.92592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.0225467998534441, "kl": 0.010874481871724129, "learning_rate": 1.3283333333333335e-06, "loss": 0.0005, "num_tokens": 2270264.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 7610 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 140.94444444444446, "frac_reward_zero_std": 1.0, "grad_norm": 0.05653819814324379, "kl": 0.010226999409496784, "learning_rate": 1.3277777777777778e-06, "loss": 0.0005, "num_tokens": 2270536.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7611 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.75, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 17.75, "completions/mean_terminated_length": 17.75, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 140.96296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.024637263268232346, "kl": 0.0060175086837261915, "learning_rate": 1.3272222222222222e-06, "loss": 0.0003, "num_tokens": 2270835.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7612 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.75, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.75, "completions/mean_terminated_length": 6.75, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 140.9814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.015373572707176208, "kl": 0.0011276879522483796, "learning_rate": 1.3266666666666667e-06, "loss": 0.0001, "num_tokens": 2271070.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 7613 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 141.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.039405450224876404, "kl": 0.04565204121172428, "learning_rate": 1.3261111111111113e-06, "loss": 0.0023, "num_tokens": 2271346.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7614 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.25, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 17.25, "completions/mean_terminated_length": 17.25, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 141.0185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.031095199286937714, "kl": 0.005256211734376848, "learning_rate": 1.3255555555555557e-06, "loss": 0.0002, "num_tokens": 2271655.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7615 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 31.5, "completions/clipped_ratio": 0.0, "completions/max_length": 56.0, "completions/max_terminated_length": 56.0, "completions/mean_length": 31.5, "completions/mean_terminated_length": 31.5, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 141.03703703703704, "frac_reward_zero_std": 0.0, "grad_norm": 2.642552137374878, "kl": 0.09091750904917717, "learning_rate": 1.3250000000000002e-06, "loss": 0.1562, "num_tokens": 2271997.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 7616 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 141.05555555555554, "frac_reward_zero_std": 1.0, "grad_norm": 0.006537110544741154, "kl": 0.0005426555871963501, "learning_rate": 1.3244444444444446e-06, "loss": 0.0, "num_tokens": 2272209.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 7617 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 141.07407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.054067421704530716, "kl": 0.007097560912370682, "learning_rate": 1.323888888888889e-06, "loss": 0.0004, "num_tokens": 2272469.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 7618 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 7.75, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 7.75, "completions/mean_terminated_length": 7.75, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 141.09259259259258, "frac_reward_zero_std": 1.0, "grad_norm": 0.1479589194059372, "kl": 0.01305995974689722, "learning_rate": 1.3233333333333335e-06, "loss": 0.0006, "num_tokens": 2272712.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 7619 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.0, "completions/clipped_ratio": 0.0, "completions/max_length": 49.0, "completions/max_terminated_length": 49.0, "completions/mean_length": 19.0, "completions/mean_terminated_length": 19.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 141.11111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.6872127056121826, "kl": 0.10643204720690846, "learning_rate": 1.3227777777777779e-06, "loss": 0.0087, "num_tokens": 2273012.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7620 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 141.12962962962962, "frac_reward_zero_std": 1.0, "grad_norm": 0.004748884122818708, "kl": 0.00015523433830821887, "learning_rate": 1.3222222222222222e-06, "loss": 0.0, "num_tokens": 2273268.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 7621 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.75, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.75, "completions/mean_terminated_length": 6.75, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 141.14814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.007737734355032444, "kl": 0.0007074148743413389, "learning_rate": 1.3216666666666666e-06, "loss": 0.0, "num_tokens": 2273503.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 7622 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.0, "completions/max_length": 37.0, "completions/max_terminated_length": 37.0, "completions/mean_length": 32.0, "completions/mean_terminated_length": 32.0, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 141.16666666666666, "frac_reward_zero_std": 0.0, "grad_norm": 3.625048875808716, "kl": 0.09168891981244087, "learning_rate": 1.3211111111111114e-06, "loss": 0.1269, "num_tokens": 2273851.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 7623 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 141.1851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.04582186043262482, "kl": 0.02753307204693556, "learning_rate": 1.3205555555555557e-06, "loss": 0.0014, "num_tokens": 2274143.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7624 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.5, "completions/clipped_ratio": 0.0, "completions/max_length": 47.0, "completions/max_terminated_length": 47.0, "completions/mean_length": 24.5, "completions/mean_terminated_length": 24.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 141.2037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.09265636652708054, "kl": 0.10772186145186424, "learning_rate": 1.32e-06, "loss": 0.0048, "num_tokens": 2274481.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7625 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.75, "completions/clipped_ratio": 0.0, "completions/max_length": 30.0, "completions/max_terminated_length": 30.0, "completions/mean_length": 20.75, "completions/mean_terminated_length": 20.75, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 141.22222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.14246323704719543, "kl": 0.15732145309448242, "learning_rate": 1.3194444444444446e-06, "loss": 0.0076, "num_tokens": 2274784.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7626 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 141.24074074074073, "frac_reward_zero_std": 1.0, "grad_norm": 0.021073808893561363, "kl": 0.04559933394193649, "learning_rate": 1.318888888888889e-06, "loss": 0.0023, "num_tokens": 2275060.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7627 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.75, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 19.75, "completions/mean_terminated_length": 19.75, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 141.25925925925927, "frac_reward_zero_std": 1.0, "grad_norm": 0.05414974316954613, "kl": 0.022171523422002792, "learning_rate": 1.3183333333333333e-06, "loss": 0.0011, "num_tokens": 2275359.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7628 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 141.27777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.07464660704135895, "kl": 0.024838660145178437, "learning_rate": 1.317777777777778e-06, "loss": 0.0012, "num_tokens": 2275649.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7629 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 35.0, "completions/clipped_ratio": 0.0, "completions/max_length": 53.0, "completions/max_terminated_length": 53.0, "completions/mean_length": 35.0, "completions/mean_terminated_length": 35.0, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 141.2962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.07617384940385818, "kl": 0.11837596818804741, "learning_rate": 1.3172222222222223e-06, "loss": 0.0058, "num_tokens": 2276013.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7630 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 5.5, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 5.5, "completions/mean_terminated_length": 5.5, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 141.3148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.011737214401364326, "kl": 0.0003254387993365526, "learning_rate": 1.3166666666666666e-06, "loss": 0.0, "num_tokens": 2276235.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 7631 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 141.33333333333334, "frac_reward_zero_std": 1.0, "grad_norm": 0.006077601574361324, "kl": 0.005791260860860348, "learning_rate": 1.3161111111111114e-06, "loss": 0.0003, "num_tokens": 2276547.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 7632 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 141.35185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.10023076832294464, "kl": 0.01477342564612627, "learning_rate": 1.3155555555555558e-06, "loss": 0.0009, "num_tokens": 2276811.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 7633 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.25, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 22.25, "completions/mean_terminated_length": 22.25, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 141.37037037037038, "frac_reward_zero_std": 0.0, "grad_norm": 2.4622459411621094, "kl": 0.00918570184148848, "learning_rate": 1.3150000000000001e-06, "loss": 0.0479, "num_tokens": 2277144.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 7634 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 141.38888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.03392545133829117, "kl": 0.006293905200436711, "learning_rate": 1.3144444444444447e-06, "loss": 0.0003, "num_tokens": 2277414.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7635 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 75.25, "completions/clipped_ratio": 0.25, "completions/max_length": 256.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 75.25, "completions/mean_terminated_length": 15.0, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 141.40740740740742, "frac_reward_zero_std": 0.0, "grad_norm": 1.947164535522461, "kl": 0.33921676129102707, "learning_rate": 1.313888888888889e-06, "loss": 0.4581, "num_tokens": 2277939.0, "reward": 5.800000190734863, "reward_std": 4.400000095367432, "rewards/reward_combined/mean": 5.800000190734863, "rewards/reward_combined/std": 4.400000095367432, "step": 7636 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 141.42592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.020823121070861816, "kl": 0.0033616446889936924, "learning_rate": 1.3133333333333334e-06, "loss": 0.0002, "num_tokens": 2278227.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7637 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.25, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 21.25, "completions/mean_terminated_length": 21.25, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 141.44444444444446, "frac_reward_zero_std": 1.0, "grad_norm": 0.08258333057165146, "kl": 0.019900713581591845, "learning_rate": 1.3127777777777777e-06, "loss": 0.001, "num_tokens": 2278564.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7638 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.5, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 22.5, "completions/mean_terminated_length": 22.5, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 141.46296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.0282336063683033, "kl": 0.009856675751507282, "learning_rate": 1.3122222222222223e-06, "loss": 0.0005, "num_tokens": 2278882.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7639 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 42.0, "completions/clipped_ratio": 0.0, "completions/max_length": 45.0, "completions/max_terminated_length": 45.0, "completions/mean_length": 42.0, "completions/mean_terminated_length": 42.0, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 141.4814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.031599558889865875, "kl": 0.07938207313418388, "learning_rate": 1.3116666666666667e-06, "loss": 0.004, "num_tokens": 2279318.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7640 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.75, "completions/clipped_ratio": 0.0, "completions/max_length": 28.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 16.75, "completions/mean_terminated_length": 16.75, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 141.5, "frac_reward_zero_std": 1.0, "grad_norm": 2.1087560653686523, "kl": 0.2255753893405199, "learning_rate": 1.3111111111111112e-06, "loss": 0.0149, "num_tokens": 2279605.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7641 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 18.0, "completions/mean_terminated_length": 18.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 141.5185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.08689375221729279, "kl": 0.025720891659148037, "learning_rate": 1.3105555555555558e-06, "loss": 0.0013, "num_tokens": 2279901.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7642 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 47.75, "completions/clipped_ratio": 0.0, "completions/max_length": 51.0, "completions/max_terminated_length": 51.0, "completions/mean_length": 47.75, "completions/mean_terminated_length": 47.75, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "epoch": 141.53703703703704, "frac_reward_zero_std": 0.0, "grad_norm": 1.3153538703918457, "kl": 0.07003742456436157, "learning_rate": 1.3100000000000002e-06, "loss": -0.0579, "num_tokens": 2280372.0, "reward": 2.375, "reward_std": 1.25, "rewards/reward_combined/mean": 2.375, "rewards/reward_combined/std": 1.25, "step": 7643 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 141.55555555555554, "frac_reward_zero_std": 1.0, "grad_norm": 0.06251082569360733, "kl": 0.007264623302035034, "learning_rate": 1.3094444444444445e-06, "loss": 0.0004, "num_tokens": 2280656.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7644 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.5, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 19.5, "completions/mean_terminated_length": 19.5, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 141.57407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.14359450340270996, "kl": 0.06011972948908806, "learning_rate": 1.308888888888889e-06, "loss": 0.003, "num_tokens": 2280954.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 7645 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 141.59259259259258, "frac_reward_zero_std": 1.0, "grad_norm": 0.007926175370812416, "kl": 0.0014568130718544126, "learning_rate": 1.3083333333333334e-06, "loss": 0.0001, "num_tokens": 2281272.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7646 }, { "clip_ratio/high_max": 0.009259259328246117, "clip_ratio/high_mean": 0.009259259328246117, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.009259259328246117, "completion_length": 23.25, "completions/clipped_ratio": 0.0, "completions/max_length": 27.0, "completions/max_terminated_length": 27.0, "completions/mean_length": 23.25, "completions/mean_terminated_length": 23.25, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 141.61111111111111, "frac_reward_zero_std": 0.0, "grad_norm": 4.730757236480713, "kl": 0.11034990847110748, "learning_rate": 1.3077777777777778e-06, "loss": -0.0806, "num_tokens": 2281593.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 7647 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 141.62962962962962, "frac_reward_zero_std": 1.0, "grad_norm": 0.013385854661464691, "kl": 0.0041402727365493774, "learning_rate": 1.3072222222222222e-06, "loss": 0.0002, "num_tokens": 2281801.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 7648 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 141.64814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.04053223133087158, "kl": 0.008422242011874914, "learning_rate": 1.3066666666666667e-06, "loss": 0.0004, "num_tokens": 2282073.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7649 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 141.66666666666666, "frac_reward_zero_std": 1.0, "grad_norm": 0.06884156912565231, "kl": 0.01297412981512025, "learning_rate": 1.3061111111111113e-06, "loss": 0.0007, "num_tokens": 2282337.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7650 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.25, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 21.25, "completions/mean_terminated_length": 21.25, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 141.6851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.8581167459487915, "kl": 0.08947846852242947, "learning_rate": 1.3055555555555556e-06, "loss": 0.0047, "num_tokens": 2282670.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 7651 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.5, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 16.5, "completions/mean_terminated_length": 16.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 141.7037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.3016635477542877, "kl": 0.032363214530050755, "learning_rate": 1.3050000000000002e-06, "loss": 0.0017, "num_tokens": 2282960.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7652 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 141.72222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.0031358080450445414, "kl": 0.0002901703119277954, "learning_rate": 1.3044444444444446e-06, "loss": 0.0, "num_tokens": 2283172.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 7653 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 141.74074074074073, "frac_reward_zero_std": 0.0, "grad_norm": 3.6037356853485107, "kl": 0.07308314554393291, "learning_rate": 1.303888888888889e-06, "loss": 0.0057, "num_tokens": 2283452.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 7654 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.5, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 19.5, "completions/mean_terminated_length": 19.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 141.75925925925927, "frac_reward_zero_std": 1.0, "grad_norm": 0.08070283383131027, "kl": 0.2190248966217041, "learning_rate": 1.3033333333333335e-06, "loss": 0.0113, "num_tokens": 2283778.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7655 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.0, "completions/clipped_ratio": 0.0, "completions/max_length": 36.0, "completions/max_terminated_length": 36.0, "completions/mean_length": 21.0, "completions/mean_terminated_length": 21.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 141.77777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 1.1895170211791992, "kl": 0.35639001429080963, "learning_rate": 1.3027777777777778e-06, "loss": 0.0192, "num_tokens": 2284086.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7656 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 26.75, "completions/clipped_ratio": 0.0, "completions/max_length": 35.0, "completions/max_terminated_length": 35.0, "completions/mean_length": 26.75, "completions/mean_terminated_length": 26.75, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 141.7962962962963, "frac_reward_zero_std": 0.0, "grad_norm": 1.7637391090393066, "kl": 0.07666476257145405, "learning_rate": 1.3022222222222222e-06, "loss": -0.0808, "num_tokens": 2284445.0, "reward": 7.625, "reward_std": 0.25, "rewards/reward_combined/mean": 7.625, "rewards/reward_combined/std": 0.25, "step": 7657 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 141.8148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.006532583385705948, "kl": 0.0009582689381204545, "learning_rate": 1.301666666666667e-06, "loss": 0.0001, "num_tokens": 2284713.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7658 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 141.83333333333334, "frac_reward_zero_std": 1.0, "grad_norm": 0.11345140635967255, "kl": 0.03197849867865443, "learning_rate": 1.3011111111111113e-06, "loss": 0.0018, "num_tokens": 2285017.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7659 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 88.75, "completions/clipped_ratio": 0.25, "completions/max_length": 256.0, "completions/max_terminated_length": 37.0, "completions/mean_length": 88.75, "completions/mean_terminated_length": 33.0, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 141.85185185185185, "frac_reward_zero_std": 0.0, "grad_norm": 1.2643719911575317, "kl": 0.042686134576797485, "learning_rate": 1.3005555555555557e-06, "loss": 0.3951, "num_tokens": 2285596.0, "reward": 5.550000190734863, "reward_std": 3.9000003337860107, "rewards/reward_combined/mean": 5.550000190734863, "rewards/reward_combined/std": 3.9000000953674316, "step": 7660 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 141.87037037037038, "frac_reward_zero_std": 1.0, "grad_norm": 0.004697282798588276, "kl": 0.010139696300029755, "learning_rate": 1.3e-06, "loss": 0.0005, "num_tokens": 2285832.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 7661 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 34.5, "completions/clipped_ratio": 0.0, "completions/max_length": 45.0, "completions/max_terminated_length": 45.0, "completions/mean_length": 34.5, "completions/mean_terminated_length": 34.5, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 141.88888888888889, "frac_reward_zero_std": 0.0, "grad_norm": 2.2639811038970947, "kl": 0.09879434108734131, "learning_rate": 1.2994444444444446e-06, "loss": -0.0953, "num_tokens": 2286206.0, "reward": 7.625, "reward_std": 0.25, "rewards/reward_combined/mean": 7.625, "rewards/reward_combined/std": 0.25, "step": 7662 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 35.25, "completions/clipped_ratio": 0.0, "completions/max_length": 36.0, "completions/max_terminated_length": 36.0, "completions/mean_length": 35.25, "completions/mean_terminated_length": 35.25, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 141.90740740740742, "frac_reward_zero_std": 1.0, "grad_norm": 0.015881534665822983, "kl": 0.06238299794495106, "learning_rate": 1.298888888888889e-06, "loss": 0.0031, "num_tokens": 2286575.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7663 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.25, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 15.25, "completions/mean_terminated_length": 15.25, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 141.92592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.0433138832449913, "kl": 0.2375849336385727, "learning_rate": 1.2983333333333333e-06, "loss": 0.0118, "num_tokens": 2286876.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7664 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.25, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 18.25, "completions/mean_terminated_length": 18.25, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 141.94444444444446, "frac_reward_zero_std": 1.0, "grad_norm": 0.03165007755160332, "kl": 0.014039484318345785, "learning_rate": 1.2977777777777779e-06, "loss": 0.0007, "num_tokens": 2287177.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7665 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 141.96296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.04049559310078621, "kl": 0.028758302330970764, "learning_rate": 1.2972222222222222e-06, "loss": 0.0014, "num_tokens": 2287393.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 7666 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 141.9814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.0001545412524137646, "kl": 1.9334256649017334e-05, "learning_rate": 1.2966666666666668e-06, "loss": 0.0, "num_tokens": 2287613.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 7667 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 142.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.017094464972615242, "kl": 0.004081830149516463, "learning_rate": 1.2961111111111114e-06, "loss": 0.0002, "num_tokens": 2287917.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7668 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.5, "completions/clipped_ratio": 0.0, "completions/max_length": 25.0, "completions/max_terminated_length": 25.0, "completions/mean_length": 20.5, "completions/mean_terminated_length": 20.5, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 142.0185185185185, "frac_reward_zero_std": 0.0, "grad_norm": 6.2544426918029785, "kl": 0.5609347745776176, "learning_rate": 1.2955555555555557e-06, "loss": -0.0521, "num_tokens": 2288219.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 7669 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 142.03703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.01005103811621666, "kl": 0.2511559948325157, "learning_rate": 1.295e-06, "loss": 0.0125, "num_tokens": 2288517.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7670 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.5, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 19.5, "completions/mean_terminated_length": 19.5, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 142.05555555555554, "frac_reward_zero_std": 1.0, "grad_norm": 0.057873427867889404, "kl": 0.016501666978001595, "learning_rate": 1.2944444444444447e-06, "loss": 0.0008, "num_tokens": 2288815.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7671 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 142.07407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.061516620218753815, "kl": 0.007203276734799147, "learning_rate": 1.293888888888889e-06, "loss": 0.0004, "num_tokens": 2289099.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7672 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 142.09259259259258, "frac_reward_zero_std": 1.0, "grad_norm": 0.014188166707754135, "kl": 0.0009839102567639202, "learning_rate": 1.2933333333333334e-06, "loss": 0.0001, "num_tokens": 2289365.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7673 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 38.5, "completions/clipped_ratio": 0.0, "completions/max_length": 60.0, "completions/max_terminated_length": 60.0, "completions/mean_length": 38.5, "completions/mean_terminated_length": 38.5, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 142.11111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.07046015560626984, "kl": 0.043128807097673416, "learning_rate": 1.2927777777777777e-06, "loss": 0.0024, "num_tokens": 2289739.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7674 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.25, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 18.25, "completions/mean_terminated_length": 18.25, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 142.12962962962962, "frac_reward_zero_std": 0.0, "grad_norm": 1.6716974973678589, "kl": 0.024918014591094106, "learning_rate": 1.2922222222222223e-06, "loss": 0.0128, "num_tokens": 2290052.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 7675 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 18.0, "completions/mean_terminated_length": 18.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 142.14814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.08294758945703506, "kl": 0.11493811011314392, "learning_rate": 1.2916666666666669e-06, "loss": 0.0058, "num_tokens": 2290364.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7676 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 142.16666666666666, "frac_reward_zero_std": 1.0, "grad_norm": 0.022397270426154137, "kl": 0.010860130190849304, "learning_rate": 1.2911111111111112e-06, "loss": 0.0005, "num_tokens": 2290676.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 7677 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.75, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 14.75, "completions/mean_terminated_length": 14.75, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 142.1851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.009348442777991295, "kl": 0.002631434821523726, "learning_rate": 1.2905555555555558e-06, "loss": 0.0001, "num_tokens": 2290963.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7678 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.5, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 21.5, "completions/mean_terminated_length": 21.5, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 142.2037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.12823514640331268, "kl": 0.024731284007430077, "learning_rate": 1.2900000000000001e-06, "loss": 0.0012, "num_tokens": 2291293.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7679 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 142.22222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.00015891647490207106, "kl": 2.3342669010162354e-05, "learning_rate": 1.2894444444444445e-06, "loss": 0.0, "num_tokens": 2291513.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 7680 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.5, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.5, "completions/mean_terminated_length": 6.5, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 142.24074074074073, "frac_reward_zero_std": 1.0, "grad_norm": 0.0041620307601988316, "kl": 0.0003792672068811953, "learning_rate": 1.288888888888889e-06, "loss": 0.0, "num_tokens": 2291747.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 7681 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 142.25925925925927, "frac_reward_zero_std": 1.0, "grad_norm": 0.010758550837635994, "kl": 0.0021884377929382026, "learning_rate": 1.2883333333333334e-06, "loss": 0.0001, "num_tokens": 2292035.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7682 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 142.27777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.0040343874134123325, "kl": 0.0014009362203069031, "learning_rate": 1.2877777777777778e-06, "loss": 0.0001, "num_tokens": 2292353.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7683 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.75, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 4.75, "completions/mean_terminated_length": 4.75, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 142.2962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.10244589298963547, "kl": 0.029901951551437378, "learning_rate": 1.2872222222222221e-06, "loss": 0.0015, "num_tokens": 2292572.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 7684 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 142.3148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.011517862789332867, "kl": 0.00032354889845009893, "learning_rate": 1.286666666666667e-06, "loss": 0.0, "num_tokens": 2292828.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 7685 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 142.33333333333334, "frac_reward_zero_std": 1.0, "grad_norm": 0.01676338165998459, "kl": 0.00039349496364593506, "learning_rate": 1.2861111111111113e-06, "loss": 0.0, "num_tokens": 2293040.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 7686 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 31.25, "completions/clipped_ratio": 0.0, "completions/max_length": 39.0, "completions/max_terminated_length": 39.0, "completions/mean_length": 31.25, "completions/mean_terminated_length": 31.25, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 142.35185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.3201864957809448, "kl": 0.09826215356588364, "learning_rate": 1.2855555555555556e-06, "loss": 0.0049, "num_tokens": 2293401.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7687 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 35.25, "completions/clipped_ratio": 0.0, "completions/max_length": 39.0, "completions/max_terminated_length": 39.0, "completions/mean_length": 35.25, "completions/mean_terminated_length": 35.25, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 142.37037037037038, "frac_reward_zero_std": 1.0, "grad_norm": 0.033600836992263794, "kl": 0.023984426632523537, "learning_rate": 1.2850000000000002e-06, "loss": 0.0012, "num_tokens": 2293766.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7688 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 142.38888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.030478399246931076, "kl": 0.004870030330494046, "learning_rate": 1.2844444444444445e-06, "loss": 0.0003, "num_tokens": 2294036.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7689 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 30.5, "completions/clipped_ratio": 0.0, "completions/max_length": 34.0, "completions/max_terminated_length": 34.0, "completions/mean_length": 30.5, "completions/mean_terminated_length": 30.5, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 142.40740740740742, "frac_reward_zero_std": 1.0, "grad_norm": 0.11916782706975937, "kl": 0.1375107690691948, "learning_rate": 1.283888888888889e-06, "loss": 0.0069, "num_tokens": 2294382.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7690 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 142.42592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.003741892985999584, "kl": 0.0030745677649974823, "learning_rate": 1.2833333333333335e-06, "loss": 0.0002, "num_tokens": 2294642.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 7691 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.75, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.75, "completions/mean_terminated_length": 14.75, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 142.44444444444446, "frac_reward_zero_std": 1.0, "grad_norm": 0.04715866968035698, "kl": 0.01734296604990959, "learning_rate": 1.2827777777777778e-06, "loss": 0.0009, "num_tokens": 2294917.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7692 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 142.46296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.03689424321055412, "kl": 0.012976877856999636, "learning_rate": 1.2822222222222222e-06, "loss": 0.0006, "num_tokens": 2295193.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7693 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.006410256493836641, "clip_ratio/low_min": 0.006410256493836641, "clip_ratio/region_mean": 0.006410256493836641, "completion_length": 35.0, "completions/clipped_ratio": 0.0, "completions/max_length": 39.0, "completions/max_terminated_length": 39.0, "completions/mean_length": 35.0, "completions/mean_terminated_length": 35.0, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 142.4814814814815, "frac_reward_zero_std": 0.0, "grad_norm": 2.7545738220214844, "kl": 0.1013244241476059, "learning_rate": 1.281666666666667e-06, "loss": -0.0761, "num_tokens": 2295585.0, "reward": 2.625, "reward_std": 3.5910768508911133, "rewards/reward_combined/mean": 2.625, "rewards/reward_combined/std": 3.5910770893096924, "step": 7694 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.0, "completions/clipped_ratio": 0.0, "completions/max_length": 31.0, "completions/max_terminated_length": 31.0, "completions/mean_length": 22.0, "completions/mean_terminated_length": 22.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 142.5, "frac_reward_zero_std": 0.0, "grad_norm": 1.8335185050964355, "kl": 0.22051145136356354, "learning_rate": 1.2811111111111113e-06, "loss": 0.1607, "num_tokens": 2295901.0, "reward": 4.0, "reward_std": 2.915475845336914, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 2.915475845336914, "step": 7695 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 142.5185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.005082935560494661, "kl": 0.010075360536575317, "learning_rate": 1.2805555555555557e-06, "loss": 0.0005, "num_tokens": 2296137.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 7696 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 142.53703703703704, "frac_reward_zero_std": 0.0, "grad_norm": 2.4368534088134766, "kl": 0.2605900407070294, "learning_rate": 1.28e-06, "loss": 0.1584, "num_tokens": 2296401.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 7697 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 142.55555555555554, "frac_reward_zero_std": 1.0, "grad_norm": 0.026369299739599228, "kl": 0.004486419027671218, "learning_rate": 1.2794444444444446e-06, "loss": 0.0002, "num_tokens": 2296701.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7698 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.25, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.25, "completions/mean_terminated_length": 11.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 142.57407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.026574959978461266, "kl": 0.00790390931069851, "learning_rate": 1.278888888888889e-06, "loss": 0.0004, "num_tokens": 2296974.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7699 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.25, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 14.25, "completions/mean_terminated_length": 14.25, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 142.59259259259258, "frac_reward_zero_std": 1.0, "grad_norm": 0.26850569248199463, "kl": 0.05163013283163309, "learning_rate": 1.2783333333333333e-06, "loss": 0.0026, "num_tokens": 2297251.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7700 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 142.61111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.04323847219347954, "kl": 0.04566331394016743, "learning_rate": 1.2777777777777779e-06, "loss": 0.0023, "num_tokens": 2297527.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7701 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 142.62962962962962, "frac_reward_zero_std": 1.0, "grad_norm": 0.08061687648296356, "kl": 0.019028152339160442, "learning_rate": 1.2772222222222222e-06, "loss": 0.001, "num_tokens": 2297787.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7702 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 29.75, "completions/clipped_ratio": 0.0, "completions/max_length": 32.0, "completions/max_terminated_length": 32.0, "completions/mean_length": 29.75, "completions/mean_terminated_length": 29.75, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 142.64814814814815, "frac_reward_zero_std": 0.0, "grad_norm": 3.006680488586426, "kl": 0.16316690295934677, "learning_rate": 1.2766666666666668e-06, "loss": 0.0225, "num_tokens": 2298122.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 7703 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.5, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 8.5, "completions/mean_terminated_length": 8.5, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 142.66666666666666, "frac_reward_zero_std": 1.0, "grad_norm": 0.014141939580440521, "kl": 0.0010417461162433028, "learning_rate": 1.2761111111111114e-06, "loss": 0.0, "num_tokens": 2298376.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 7704 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.75, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 22.75, "completions/mean_terminated_length": 22.75, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 142.6851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.04617302492260933, "kl": 0.026086601428687572, "learning_rate": 1.2755555555555557e-06, "loss": 0.0013, "num_tokens": 2298715.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7705 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 142.7037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.005635038949549198, "kl": 0.00185910688014701, "learning_rate": 1.275e-06, "loss": 0.0001, "num_tokens": 2298997.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7706 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 26.5, "completions/clipped_ratio": 0.0, "completions/max_length": 47.0, "completions/max_terminated_length": 47.0, "completions/mean_length": 26.5, "completions/mean_terminated_length": 26.5, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 142.72222222222223, "frac_reward_zero_std": 0.0, "grad_norm": 3.0049755573272705, "kl": 0.24708328396081924, "learning_rate": 1.2744444444444446e-06, "loss": 0.0153, "num_tokens": 2299323.0, "reward": 4.5, "reward_std": 4.041451930999756, "rewards/reward_combined/mean": 4.5, "rewards/reward_combined/std": 4.041451930999756, "step": 7707 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.0, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 22.0, "completions/mean_terminated_length": 22.0, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 142.74074074074073, "frac_reward_zero_std": 1.0, "grad_norm": 0.022212766110897064, "kl": 0.01575500052422285, "learning_rate": 1.273888888888889e-06, "loss": 0.0008, "num_tokens": 2299639.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7708 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 142.75925925925927, "frac_reward_zero_std": 1.0, "grad_norm": 0.0037394771352410316, "kl": 0.2819572240114212, "learning_rate": 1.2733333333333334e-06, "loss": 0.0141, "num_tokens": 2299927.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7709 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.5, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 20.5, "completions/mean_terminated_length": 20.5, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 142.77777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.08173829317092896, "kl": 0.01892545446753502, "learning_rate": 1.2727777777777777e-06, "loss": 0.0009, "num_tokens": 2300261.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7710 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 142.7962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.006786602083593607, "kl": 0.0006982982158660889, "learning_rate": 1.2722222222222223e-06, "loss": 0.0, "num_tokens": 2300473.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 7711 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.25, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 6.25, "completions/mean_terminated_length": 6.25, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 142.8148148148148, "frac_reward_zero_std": 0.0, "grad_norm": 9.112221717834473, "kl": 0.012387282215058804, "learning_rate": 1.2716666666666668e-06, "loss": 0.1781, "num_tokens": 2300698.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 7712 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.75, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.75, "completions/mean_terminated_length": 14.75, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 142.83333333333334, "frac_reward_zero_std": 0.0, "grad_norm": 1.9227463006973267, "kl": 0.1647617109119892, "learning_rate": 1.2711111111111112e-06, "loss": -0.0067, "num_tokens": 2300989.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 7713 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.75, "completions/clipped_ratio": 0.0, "completions/max_length": 34.0, "completions/max_terminated_length": 34.0, "completions/mean_length": 32.75, "completions/mean_terminated_length": 32.75, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 142.85185185185185, "frac_reward_zero_std": 0.0, "grad_norm": 1.6638795137405396, "kl": 0.15343919396400452, "learning_rate": 1.2705555555555558e-06, "loss": 0.0085, "num_tokens": 2301348.0, "reward": 7.875, "reward_std": 0.25, "rewards/reward_combined/mean": 7.875, "rewards/reward_combined/std": 0.25, "step": 7714 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 7.75, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 7.75, "completions/mean_terminated_length": 7.75, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 142.87037037037038, "frac_reward_zero_std": 1.0, "grad_norm": 0.0749397948384285, "kl": 0.006465860642492771, "learning_rate": 1.2700000000000001e-06, "loss": 0.0003, "num_tokens": 2301591.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 7715 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 142.88888888888889, "frac_reward_zero_std": 0.0, "grad_norm": 19.469655990600586, "kl": 0.13673235476016998, "learning_rate": 1.2694444444444445e-06, "loss": 0.0061, "num_tokens": 2301803.0, "reward": 2.75, "reward_std": 1.5, "rewards/reward_combined/mean": 2.75, "rewards/reward_combined/std": 1.5, "step": 7716 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 142.90740740740742, "frac_reward_zero_std": 1.0, "grad_norm": 0.06735656410455704, "kl": 0.4304622858762741, "learning_rate": 1.268888888888889e-06, "loss": 0.0215, "num_tokens": 2302087.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7717 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.25, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 15.25, "completions/mean_terminated_length": 15.25, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 142.92592592592592, "frac_reward_zero_std": 0.0, "grad_norm": 4.033822536468506, "kl": 0.07185172289609909, "learning_rate": 1.2683333333333334e-06, "loss": -0.0112, "num_tokens": 2302380.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 7718 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 45.0, "completions/clipped_ratio": 0.0, "completions/max_length": 45.0, "completions/max_terminated_length": 45.0, "completions/mean_length": 45.0, "completions/mean_terminated_length": 45.0, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "epoch": 142.94444444444446, "frac_reward_zero_std": 0.0, "grad_norm": 0.39661937952041626, "kl": 0.11012735590338707, "learning_rate": 1.2677777777777778e-06, "loss": 0.0051, "num_tokens": 2302828.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 7719 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 55.75, "completions/clipped_ratio": 0.0, "completions/max_length": 74.0, "completions/max_terminated_length": 74.0, "completions/mean_length": 55.75, "completions/mean_terminated_length": 55.75, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 142.96296296296296, "frac_reward_zero_std": 0.0, "grad_norm": 1.2546188831329346, "kl": 0.27921103686094284, "learning_rate": 1.2672222222222225e-06, "loss": -0.0066, "num_tokens": 2303331.0, "reward": 1.7999999523162842, "reward_std": 2.3999998569488525, "rewards/reward_combined/mean": 1.7999999523162842, "rewards/reward_combined/std": 2.3999998569488525, "step": 7720 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 142.9814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.049283917993307114, "kl": 0.004220618633553386, "learning_rate": 1.2666666666666669e-06, "loss": 0.0002, "num_tokens": 2303627.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7721 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.0, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 22.0, "completions/mean_terminated_length": 22.0, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 143.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.09469142556190491, "kl": 0.017639169469475746, "learning_rate": 1.2661111111111112e-06, "loss": 0.0009, "num_tokens": 2303963.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 7722 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.75, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 14.75, "completions/mean_terminated_length": 14.75, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 143.0185185185185, "frac_reward_zero_std": 0.0, "grad_norm": 5.024570465087891, "kl": 0.1437519162427634, "learning_rate": 1.2655555555555556e-06, "loss": 0.0558, "num_tokens": 2304238.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 7723 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 143.03703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.0001571804314153269, "kl": 2.0422041416168213e-05, "learning_rate": 1.2650000000000002e-06, "loss": 0.0, "num_tokens": 2304458.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 7724 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.75, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 21.75, "completions/mean_terminated_length": 21.75, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 143.05555555555554, "frac_reward_zero_std": 1.0, "grad_norm": 0.0762934684753418, "kl": 0.014605706091970205, "learning_rate": 1.2644444444444445e-06, "loss": 0.0008, "num_tokens": 2304789.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7725 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.25, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 9.25, "completions/mean_terminated_length": 9.25, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 143.07407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.04251650348305702, "kl": 0.0038191387429833412, "learning_rate": 1.2638888888888889e-06, "loss": 0.0002, "num_tokens": 2305038.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 7726 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.75, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.75, "completions/mean_terminated_length": 6.75, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 143.09259259259258, "frac_reward_zero_std": 1.0, "grad_norm": 0.009880401194095612, "kl": 0.0005590142391156405, "learning_rate": 1.2633333333333334e-06, "loss": 0.0, "num_tokens": 2305273.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 7727 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.75, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 4.75, "completions/mean_terminated_length": 4.75, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 143.11111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.1049741804599762, "kl": 0.030291395261883736, "learning_rate": 1.2627777777777778e-06, "loss": 0.0015, "num_tokens": 2305492.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 7728 }, { "clip_ratio/high_max": 0.0071428571827709675, "clip_ratio/high_mean": 0.0071428571827709675, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0071428571827709675, "completion_length": 33.5, "completions/clipped_ratio": 0.0, "completions/max_length": 39.0, "completions/max_terminated_length": 39.0, "completions/mean_length": 33.5, "completions/mean_terminated_length": 33.5, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 143.12962962962962, "frac_reward_zero_std": 0.0, "grad_norm": 1.228808879852295, "kl": 0.14297305047512054, "learning_rate": 1.2622222222222224e-06, "loss": -0.1054, "num_tokens": 2305854.0, "reward": 7.875, "reward_std": 0.25, "rewards/reward_combined/mean": 7.875, "rewards/reward_combined/std": 0.25, "step": 7729 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 47.0, "completions/clipped_ratio": 0.0, "completions/max_length": 51.0, "completions/max_terminated_length": 51.0, "completions/mean_length": 47.0, "completions/mean_terminated_length": 47.0, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 143.14814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.01053815707564354, "kl": 0.039466364309191704, "learning_rate": 1.261666666666667e-06, "loss": 0.002, "num_tokens": 2306322.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 7730 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 143.16666666666666, "frac_reward_zero_std": 1.0, "grad_norm": 0.0019570081494748592, "kl": 0.0017549671465530992, "learning_rate": 1.2611111111111113e-06, "loss": 0.0001, "num_tokens": 2306602.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7731 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 10.5, "completions/mean_terminated_length": 10.5, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 143.1851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.01838877983391285, "kl": 0.004127393593080342, "learning_rate": 1.2605555555555557e-06, "loss": 0.0002, "num_tokens": 2306868.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7732 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.5, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 19.5, "completions/mean_terminated_length": 19.5, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 143.2037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.023919902741909027, "kl": 0.005376315442845225, "learning_rate": 1.26e-06, "loss": 0.0003, "num_tokens": 2307174.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7733 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 143.22222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.04192677512764931, "kl": 0.030895966105163097, "learning_rate": 1.2594444444444446e-06, "loss": 0.0017, "num_tokens": 2307446.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7734 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 7.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 7.0, "completions/mean_terminated_length": 7.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 143.24074074074073, "frac_reward_zero_std": 1.0, "grad_norm": 0.032767534255981445, "kl": 0.0019195274217054248, "learning_rate": 1.258888888888889e-06, "loss": 0.0001, "num_tokens": 2307694.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 7735 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.0, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 20.0, "completions/mean_terminated_length": 20.0, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 143.25925925925927, "frac_reward_zero_std": 1.0, "grad_norm": 0.09788404405117035, "kl": 0.026148106902837753, "learning_rate": 1.2583333333333333e-06, "loss": 0.0013, "num_tokens": 2307994.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7736 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 143.27777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.011458617635071278, "kl": 0.001647493219934404, "learning_rate": 1.2577777777777779e-06, "loss": 0.0001, "num_tokens": 2308250.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7737 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 143.2962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.02565034292638302, "kl": 0.012130501039791852, "learning_rate": 1.2572222222222224e-06, "loss": 0.0006, "num_tokens": 2308538.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7738 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 39.0, "completions/clipped_ratio": 0.0, "completions/max_length": 46.0, "completions/max_terminated_length": 46.0, "completions/mean_length": 39.0, "completions/mean_terminated_length": 39.0, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 143.3148148148148, "frac_reward_zero_std": 0.0, "grad_norm": 2.6562094688415527, "kl": 0.2977127581834793, "learning_rate": 1.2566666666666668e-06, "loss": 0.0603, "num_tokens": 2308918.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 7739 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 143.33333333333334, "frac_reward_zero_std": 1.0, "grad_norm": 0.052721232175827026, "kl": 0.03154151188209653, "learning_rate": 1.2561111111111113e-06, "loss": 0.0017, "num_tokens": 2309218.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7740 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 143.35185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.0036521288566291332, "kl": 0.2819887399673462, "learning_rate": 1.2555555555555557e-06, "loss": 0.0141, "num_tokens": 2309506.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7741 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.25, "completions/clipped_ratio": 0.0, "completions/max_length": 25.0, "completions/max_terminated_length": 25.0, "completions/mean_length": 17.25, "completions/mean_terminated_length": 17.25, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 143.37037037037038, "frac_reward_zero_std": 1.0, "grad_norm": 0.2984003722667694, "kl": 0.05990143306553364, "learning_rate": 1.255e-06, "loss": 0.0027, "num_tokens": 2309795.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7742 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.0, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 21.0, "completions/mean_terminated_length": 21.0, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 143.38888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.03719792887568474, "kl": 0.013007923029363155, "learning_rate": 1.2544444444444446e-06, "loss": 0.0007, "num_tokens": 2310131.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7743 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.5, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 22.5, "completions/mean_terminated_length": 22.5, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 143.40740740740742, "frac_reward_zero_std": 1.0, "grad_norm": 0.05202605947852135, "kl": 0.02772366814315319, "learning_rate": 1.253888888888889e-06, "loss": 0.0014, "num_tokens": 2310469.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7744 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.75, "completions/clipped_ratio": 0.0, "completions/max_length": 27.0, "completions/max_terminated_length": 27.0, "completions/mean_length": 19.75, "completions/mean_terminated_length": 19.75, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 143.42592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.19645605981349945, "kl": 0.1667315661907196, "learning_rate": 1.2533333333333333e-06, "loss": 0.0082, "num_tokens": 2310768.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7745 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.5, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 2.5, "completions/mean_terminated_length": 2.5, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 143.44444444444446, "frac_reward_zero_std": 1.0, "grad_norm": 0.01763930916786194, "kl": 0.002056039869785309, "learning_rate": 1.2527777777777777e-06, "loss": 0.0001, "num_tokens": 2310974.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 7746 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.25, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.25, "completions/mean_terminated_length": 14.25, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 143.46296296296296, "frac_reward_zero_std": 0.0, "grad_norm": 4.290856838226318, "kl": 0.4765743017196655, "learning_rate": 1.2522222222222225e-06, "loss": -0.0299, "num_tokens": 2311255.0, "reward": 7.875, "reward_std": 0.25, "rewards/reward_combined/mean": 7.875, "rewards/reward_combined/std": 0.25, "step": 7747 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.25, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 15.25, "completions/mean_terminated_length": 15.25, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 143.4814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.048773735761642456, "kl": 0.030540240928530693, "learning_rate": 1.2516666666666668e-06, "loss": 0.0016, "num_tokens": 2311548.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7748 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.25, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.25, "completions/mean_terminated_length": 11.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 143.5, "frac_reward_zero_std": 1.0, "grad_norm": 0.03419240191578865, "kl": 0.009726963937282562, "learning_rate": 1.2511111111111112e-06, "loss": 0.0005, "num_tokens": 2311821.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7749 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 143.5185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.006864841096103191, "kl": 0.0006916821002960205, "learning_rate": 1.2505555555555557e-06, "loss": 0.0, "num_tokens": 2312033.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 7750 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.25, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 12.25, "completions/mean_terminated_length": 12.25, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 143.53703703703704, "frac_reward_zero_std": 0.0, "grad_norm": 8.996759414672852, "kl": 0.04527485556900501, "learning_rate": 1.25e-06, "loss": -0.201, "num_tokens": 2312306.0, "reward": 7.625, "reward_std": 0.25, "rewards/reward_combined/mean": 7.625, "rewards/reward_combined/std": 0.25, "step": 7751 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 143.55555555555554, "frac_reward_zero_std": 1.0, "grad_norm": 0.009892399422824383, "kl": 0.003732013748958707, "learning_rate": 1.2494444444444445e-06, "loss": 0.0002, "num_tokens": 2312610.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7752 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.25, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 14.25, "completions/mean_terminated_length": 14.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 143.57407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.07126224040985107, "kl": 0.017489000223577023, "learning_rate": 1.248888888888889e-06, "loss": 0.0008, "num_tokens": 2312891.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7753 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 29.0, "completions/clipped_ratio": 0.0, "completions/max_length": 30.0, "completions/max_terminated_length": 30.0, "completions/mean_length": 29.0, "completions/mean_terminated_length": 29.0, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 143.59259259259258, "frac_reward_zero_std": 1.0, "grad_norm": 0.08180562406778336, "kl": 0.13937469571828842, "learning_rate": 1.2483333333333334e-06, "loss": 0.007, "num_tokens": 2313223.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7754 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.5, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 22.5, "completions/mean_terminated_length": 22.5, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 143.61111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.04799933731555939, "kl": 0.012461441569030285, "learning_rate": 1.247777777777778e-06, "loss": 0.0006, "num_tokens": 2313541.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7755 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 18.0, "completions/mean_terminated_length": 18.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 143.62962962962962, "frac_reward_zero_std": 1.0, "grad_norm": 0.03338281810283661, "kl": 0.16680384427309036, "learning_rate": 1.2472222222222223e-06, "loss": 0.0083, "num_tokens": 2313853.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7756 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 28.75, "completions/clipped_ratio": 0.0, "completions/max_length": 43.0, "completions/max_terminated_length": 43.0, "completions/mean_length": 28.75, "completions/mean_terminated_length": 28.75, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 143.64814814814815, "frac_reward_zero_std": 0.0, "grad_norm": 2.5342533588409424, "kl": 0.09066282212734222, "learning_rate": 1.2466666666666667e-06, "loss": -0.2862, "num_tokens": 2314220.0, "reward": 7.75, "reward_std": 0.28867512941360474, "rewards/reward_combined/mean": 7.75, "rewards/reward_combined/std": 0.28867512941360474, "step": 7757 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.5, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 16.5, "completions/mean_terminated_length": 16.5, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 143.66666666666666, "frac_reward_zero_std": 1.0, "grad_norm": 0.057160280644893646, "kl": 0.008882285095751286, "learning_rate": 1.2461111111111112e-06, "loss": 0.0004, "num_tokens": 2314526.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7758 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.25, "completions/clipped_ratio": 0.0, "completions/max_length": 30.0, "completions/max_terminated_length": 30.0, "completions/mean_length": 24.25, "completions/mean_terminated_length": 24.25, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 143.6851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.07891787588596344, "kl": 0.02702695084735751, "learning_rate": 1.2455555555555556e-06, "loss": 0.0014, "num_tokens": 2314843.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7759 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 143.7037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.02307536080479622, "kl": 0.010753000155091286, "learning_rate": 1.2450000000000002e-06, "loss": 0.0005, "num_tokens": 2315155.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 7760 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.5, "completions/clipped_ratio": 0.0, "completions/max_length": 32.0, "completions/max_terminated_length": 32.0, "completions/mean_length": 21.5, "completions/mean_terminated_length": 21.5, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 143.72222222222223, "frac_reward_zero_std": 0.0, "grad_norm": 4.8860249519348145, "kl": 0.04320300184190273, "learning_rate": 1.2444444444444445e-06, "loss": 0.1409, "num_tokens": 2315489.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 7761 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.5, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 4.5, "completions/mean_terminated_length": 4.5, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 143.74074074074073, "frac_reward_zero_std": 0.0, "grad_norm": 4.659634590148926, "kl": 0.14012153446674347, "learning_rate": 1.2438888888888889e-06, "loss": 0.0535, "num_tokens": 2315703.0, "reward": 2.875, "reward_std": 1.25, "rewards/reward_combined/mean": 2.875, "rewards/reward_combined/std": 1.25, "step": 7762 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 143.75925925925927, "frac_reward_zero_std": 1.0, "grad_norm": 0.008222214877605438, "kl": 0.00046371221833396703, "learning_rate": 1.2433333333333334e-06, "loss": 0.0, "num_tokens": 2315959.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 7763 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.5, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 20.5, "completions/mean_terminated_length": 20.5, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 143.77777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.4898417890071869, "kl": 0.11348188668489456, "learning_rate": 1.2427777777777778e-06, "loss": 0.006, "num_tokens": 2316261.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 7764 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.75, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 9.75, "completions/mean_terminated_length": 9.75, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 143.7962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.19637441635131836, "kl": 0.038235537707805634, "learning_rate": 1.2422222222222224e-06, "loss": 0.0024, "num_tokens": 2316528.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 7765 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.75, "completions/clipped_ratio": 0.0, "completions/max_length": 30.0, "completions/max_terminated_length": 30.0, "completions/mean_length": 23.75, "completions/mean_terminated_length": 23.75, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 143.8148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.087103933095932, "kl": 0.04916643165051937, "learning_rate": 1.2416666666666667e-06, "loss": 0.0025, "num_tokens": 2316851.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7766 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 30.0, "completions/clipped_ratio": 0.0, "completions/max_length": 43.0, "completions/max_terminated_length": 43.0, "completions/mean_length": 30.0, "completions/mean_terminated_length": 30.0, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 143.83333333333334, "frac_reward_zero_std": 0.0, "grad_norm": 2.4976067543029785, "kl": 0.8285684213042259, "learning_rate": 1.2411111111111113e-06, "loss": -0.0314, "num_tokens": 2317239.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 7767 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 143.85185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.06546837091445923, "kl": 0.023541256203316152, "learning_rate": 1.2405555555555556e-06, "loss": 0.0012, "num_tokens": 2317523.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7768 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 143.87037037037038, "frac_reward_zero_std": 1.0, "grad_norm": 0.03248859569430351, "kl": 0.008095062104985118, "learning_rate": 1.2400000000000002e-06, "loss": 0.0004, "num_tokens": 2317811.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7769 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 5.5, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 5.5, "completions/mean_terminated_length": 5.5, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 143.88888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.06992264091968536, "kl": 0.0030780442466493696, "learning_rate": 1.2394444444444446e-06, "loss": 0.0002, "num_tokens": 2318033.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 7770 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 143.90740740740742, "frac_reward_zero_std": 1.0, "grad_norm": 0.013121463358402252, "kl": 0.23904497176408768, "learning_rate": 1.2388888888888891e-06, "loss": 0.0119, "num_tokens": 2318333.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7771 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 143.92592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.004706956911832094, "kl": 0.010204583406448364, "learning_rate": 1.2383333333333335e-06, "loss": 0.0005, "num_tokens": 2318569.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 7772 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 143.94444444444446, "frac_reward_zero_std": 1.0, "grad_norm": 0.008875647559762001, "kl": 0.0014393283054232597, "learning_rate": 1.2377777777777778e-06, "loss": 0.0001, "num_tokens": 2318891.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7773 }, { "clip_ratio/high_max": 0.006849315017461777, "clip_ratio/high_mean": 0.006849315017461777, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.006849315017461777, "completion_length": 31.75, "completions/clipped_ratio": 0.0, "completions/max_length": 39.0, "completions/max_terminated_length": 39.0, "completions/mean_length": 31.75, "completions/mean_terminated_length": 31.75, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 143.96296296296296, "frac_reward_zero_std": 0.0, "grad_norm": 1.4908117055892944, "kl": 0.13989128544926643, "learning_rate": 1.2372222222222224e-06, "loss": -0.0288, "num_tokens": 2319254.0, "reward": 6.5, "reward_std": 2.0, "rewards/reward_combined/mean": 6.5, "rewards/reward_combined/std": 2.0, "step": 7774 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.5, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 12.5, "completions/mean_terminated_length": 12.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 143.9814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.02555859088897705, "kl": 0.0015862129512242973, "learning_rate": 1.2366666666666668e-06, "loss": 0.0001, "num_tokens": 2319516.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7775 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 31.25, "completions/clipped_ratio": 0.0, "completions/max_length": 35.0, "completions/max_terminated_length": 35.0, "completions/mean_length": 31.25, "completions/mean_terminated_length": 31.25, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 144.0, "frac_reward_zero_std": 1.0, "grad_norm": 1.1539174318313599, "kl": 0.31173014640808105, "learning_rate": 1.2361111111111113e-06, "loss": 0.0147, "num_tokens": 2319865.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7776 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 144.0185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.12241224199533463, "kl": 0.04403468733653426, "learning_rate": 1.2355555555555557e-06, "loss": 0.0022, "num_tokens": 2320155.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7777 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 144.03703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.0047096023336052895, "kl": 0.010212890803813934, "learning_rate": 1.235e-06, "loss": 0.0005, "num_tokens": 2320391.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 7778 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 144.05555555555554, "frac_reward_zero_std": 1.0, "grad_norm": 0.07406233996152878, "kl": 0.029468816297594458, "learning_rate": 1.2344444444444446e-06, "loss": 0.0016, "num_tokens": 2320689.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7779 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 144.07407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.00015727906429674476, "kl": 1.9960105419158936e-05, "learning_rate": 1.233888888888889e-06, "loss": 0.0, "num_tokens": 2320909.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 7780 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 144.09259259259258, "frac_reward_zero_std": 1.0, "grad_norm": 0.007069607265293598, "kl": 0.003107115626335144, "learning_rate": 1.2333333333333335e-06, "loss": 0.0002, "num_tokens": 2321169.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 7781 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 144.11111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.013932062312960625, "kl": 0.008466188795864582, "learning_rate": 1.2327777777777779e-06, "loss": 0.0004, "num_tokens": 2321481.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 7782 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 44.0, "completions/clipped_ratio": 0.0, "completions/max_length": 45.0, "completions/max_terminated_length": 45.0, "completions/mean_length": 44.0, "completions/mean_terminated_length": 44.0, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "epoch": 144.12962962962962, "frac_reward_zero_std": 1.0, "grad_norm": 0.01753462664783001, "kl": 0.07186786830425262, "learning_rate": 1.2322222222222222e-06, "loss": 0.0036, "num_tokens": 2321925.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7783 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.75, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 11.75, "completions/mean_terminated_length": 11.75, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 144.14814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.18485191464424133, "kl": 0.03117824997752905, "learning_rate": 1.2316666666666668e-06, "loss": 0.0017, "num_tokens": 2322192.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 7784 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.25, "completions/clipped_ratio": 0.0, "completions/max_length": 31.0, "completions/max_terminated_length": 31.0, "completions/mean_length": 21.25, "completions/mean_terminated_length": 21.25, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 144.16666666666666, "frac_reward_zero_std": 0.0, "grad_norm": 5.288323402404785, "kl": 0.13950182497501373, "learning_rate": 1.2311111111111112e-06, "loss": 0.1397, "num_tokens": 2322497.0, "reward": 2.75, "reward_std": 2.5, "rewards/reward_combined/mean": 2.75, "rewards/reward_combined/std": 2.5, "step": 7785 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 144.1851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.036040760576725006, "kl": 0.0017582178115844727, "learning_rate": 1.2305555555555557e-06, "loss": 0.0001, "num_tokens": 2322709.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 7786 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.75, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 17.75, "completions/mean_terminated_length": 17.75, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 144.2037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.030701620504260063, "kl": 0.00797932269051671, "learning_rate": 1.23e-06, "loss": 0.0004, "num_tokens": 2323008.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7787 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 34.25, "completions/clipped_ratio": 0.0, "completions/max_length": 42.0, "completions/max_terminated_length": 42.0, "completions/mean_length": 34.25, "completions/mean_terminated_length": 34.25, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 144.22222222222223, "frac_reward_zero_std": 0.0, "grad_norm": 2.184863328933716, "kl": 0.19378668814897537, "learning_rate": 1.2294444444444444e-06, "loss": 0.0998, "num_tokens": 2323369.0, "reward": 4.5, "reward_std": 4.041451930999756, "rewards/reward_combined/mean": 4.5, "rewards/reward_combined/std": 4.041451930999756, "step": 7788 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 144.24074074074073, "frac_reward_zero_std": 1.0, "grad_norm": 0.01785251311957836, "kl": 0.0017679441953077912, "learning_rate": 1.228888888888889e-06, "loss": 0.0001, "num_tokens": 2323633.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7789 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.0, "completions/clipped_ratio": 0.0, "completions/max_length": 33.0, "completions/max_terminated_length": 33.0, "completions/mean_length": 25.0, "completions/mean_terminated_length": 25.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 144.25925925925927, "frac_reward_zero_std": 1.0, "grad_norm": 0.06130625680088997, "kl": 0.06263968534767628, "learning_rate": 1.2283333333333334e-06, "loss": 0.0027, "num_tokens": 2323961.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7790 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.75, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 4.75, "completions/mean_terminated_length": 4.75, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 144.27777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.05483892560005188, "kl": 0.028039975091814995, "learning_rate": 1.227777777777778e-06, "loss": 0.0014, "num_tokens": 2324180.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 7791 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 144.2962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.021897615864872932, "kl": 0.00370887981262058, "learning_rate": 1.2272222222222223e-06, "loss": 0.0002, "num_tokens": 2324468.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7792 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 144.3148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.03917797654867172, "kl": 0.012613729573786259, "learning_rate": 1.2266666666666666e-06, "loss": 0.0006, "num_tokens": 2324744.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7793 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 44.5, "completions/clipped_ratio": 0.0, "completions/max_length": 53.0, "completions/max_terminated_length": 53.0, "completions/mean_length": 44.5, "completions/mean_terminated_length": 44.5, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 144.33333333333334, "frac_reward_zero_std": 1.0, "grad_norm": 0.02936685085296631, "kl": 0.025527584832161665, "learning_rate": 1.2261111111111112e-06, "loss": 0.0013, "num_tokens": 2325142.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7794 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.5, "completions/clipped_ratio": 0.0, "completions/max_length": 35.0, "completions/max_terminated_length": 35.0, "completions/mean_length": 32.5, "completions/mean_terminated_length": 32.5, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 144.35185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.12888331711292267, "kl": 0.04230451211333275, "learning_rate": 1.2255555555555556e-06, "loss": 0.0021, "num_tokens": 2325496.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7795 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.75, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 15.75, "completions/mean_terminated_length": 15.75, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 144.37037037037038, "frac_reward_zero_std": 1.0, "grad_norm": 0.21880359947681427, "kl": 0.03271249867975712, "learning_rate": 1.2250000000000001e-06, "loss": 0.0018, "num_tokens": 2325779.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7796 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 34.0, "completions/clipped_ratio": 0.0, "completions/max_length": 36.0, "completions/max_terminated_length": 36.0, "completions/mean_length": 34.0, "completions/mean_terminated_length": 34.0, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 144.38888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.06005214527249336, "kl": 0.07639796659350395, "learning_rate": 1.2244444444444445e-06, "loss": 0.0038, "num_tokens": 2326143.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7797 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 16.5, "completions/mean_terminated_length": 16.5, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 144.40740740740742, "frac_reward_zero_std": 1.0, "grad_norm": 0.025771761313080788, "kl": 0.005544960964471102, "learning_rate": 1.223888888888889e-06, "loss": 0.0003, "num_tokens": 2326449.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7798 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 34.0, "completions/clipped_ratio": 0.0, "completions/max_length": 47.0, "completions/max_terminated_length": 47.0, "completions/mean_length": 34.0, "completions/mean_terminated_length": 34.0, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 144.42592592592592, "frac_reward_zero_std": 0.0, "grad_norm": 2.1973769664764404, "kl": 0.17541904002428055, "learning_rate": 1.2233333333333334e-06, "loss": 0.1275, "num_tokens": 2326801.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 7799 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.5, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.5, "completions/mean_terminated_length": 6.5, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 144.44444444444446, "frac_reward_zero_std": 1.0, "grad_norm": 0.009279423393309116, "kl": 0.0006918665749253705, "learning_rate": 1.2227777777777778e-06, "loss": 0.0, "num_tokens": 2327035.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 7800 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 144.46296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.13113263249397278, "kl": 0.00951986014842987, "learning_rate": 1.2222222222222223e-06, "loss": 0.0005, "num_tokens": 2327247.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 7801 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.5, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 22.5, "completions/mean_terminated_length": 22.5, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 144.4814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.04466110095381737, "kl": 0.011773890350013971, "learning_rate": 1.221666666666667e-06, "loss": 0.0006, "num_tokens": 2327565.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7802 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 144.5, "frac_reward_zero_std": 1.0, "grad_norm": 0.13376040756702423, "kl": 0.0267796590924263, "learning_rate": 1.2211111111111113e-06, "loss": 0.0014, "num_tokens": 2327837.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7803 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.5, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 17.5, "completions/mean_terminated_length": 17.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 144.5185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.03470237925648689, "kl": 0.1749926432967186, "learning_rate": 1.2205555555555556e-06, "loss": 0.0087, "num_tokens": 2328147.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7804 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.5, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 9.5, "completions/mean_terminated_length": 9.5, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 144.53703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.024018803611397743, "kl": 0.001486402004957199, "learning_rate": 1.2200000000000002e-06, "loss": 0.0001, "num_tokens": 2328405.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7805 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.25, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.25, "completions/mean_terminated_length": 14.25, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 144.55555555555554, "frac_reward_zero_std": 1.0, "grad_norm": 0.013140937313437462, "kl": 0.0016355154803022742, "learning_rate": 1.2194444444444445e-06, "loss": 0.0001, "num_tokens": 2328722.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7806 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 144.57407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.03010704554617405, "kl": 0.014909346587955952, "learning_rate": 1.218888888888889e-06, "loss": 0.0007, "num_tokens": 2328990.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7807 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 144.59259259259258, "frac_reward_zero_std": 0.0, "grad_norm": 8.760660171508789, "kl": 0.07782384194433689, "learning_rate": 1.2183333333333335e-06, "loss": 0.1871, "num_tokens": 2329260.0, "reward": 5.875, "reward_std": 3.25, "rewards/reward_combined/mean": 5.875, "rewards/reward_combined/std": 3.25, "step": 7808 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 144.61111111111111, "frac_reward_zero_std": 0.0, "grad_norm": 3.3685624599456787, "kl": 0.09175347187556326, "learning_rate": 1.2177777777777778e-06, "loss": 0.0359, "num_tokens": 2329552.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 7809 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 27.0, "completions/clipped_ratio": 0.0, "completions/max_length": 30.0, "completions/max_terminated_length": 30.0, "completions/mean_length": 27.0, "completions/mean_terminated_length": 27.0, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 144.62962962962962, "frac_reward_zero_std": 1.0, "grad_norm": 0.0413917601108551, "kl": 0.0975726991891861, "learning_rate": 1.2172222222222224e-06, "loss": 0.0051, "num_tokens": 2329896.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7810 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.0, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 21.0, "completions/mean_terminated_length": 21.0, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 144.64814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.028446726500988007, "kl": 0.012509326450526714, "learning_rate": 1.2166666666666667e-06, "loss": 0.0006, "num_tokens": 2330232.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7811 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 47.25, "completions/clipped_ratio": 0.0, "completions/max_length": 51.0, "completions/max_terminated_length": 51.0, "completions/mean_length": 47.25, "completions/mean_terminated_length": 47.25, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "epoch": 144.66666666666666, "frac_reward_zero_std": 1.0, "grad_norm": 0.05480636656284332, "kl": 0.05033087357878685, "learning_rate": 1.2161111111111113e-06, "loss": 0.0025, "num_tokens": 2330701.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 7812 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.5, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 19.5, "completions/mean_terminated_length": 19.5, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 144.6851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.11048226803541183, "kl": 0.020982240326702595, "learning_rate": 1.2155555555555557e-06, "loss": 0.0011, "num_tokens": 2331027.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 7813 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 36.75, "completions/clipped_ratio": 0.0, "completions/max_length": 47.0, "completions/max_terminated_length": 47.0, "completions/mean_length": 36.75, "completions/mean_terminated_length": 36.75, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 144.7037037037037, "frac_reward_zero_std": 0.0, "grad_norm": 1.5574991703033447, "kl": 0.07570740580558777, "learning_rate": 1.215e-06, "loss": -0.0663, "num_tokens": 2331426.0, "reward": 7.875, "reward_std": 0.25, "rewards/reward_combined/mean": 7.875, "rewards/reward_combined/std": 0.25, "step": 7814 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.75, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 17.75, "completions/mean_terminated_length": 17.75, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 144.72222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.14033208787441254, "kl": 0.033725653775036335, "learning_rate": 1.2144444444444446e-06, "loss": 0.0017, "num_tokens": 2331717.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7815 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 144.74074074074073, "frac_reward_zero_std": 1.0, "grad_norm": 0.12279761582612991, "kl": 0.17218729108572006, "learning_rate": 1.213888888888889e-06, "loss": 0.0086, "num_tokens": 2332001.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7816 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.75, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.75, "completions/mean_terminated_length": 14.75, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 144.75925925925927, "frac_reward_zero_std": 0.0, "grad_norm": 5.320235252380371, "kl": 0.4474906772375107, "learning_rate": 1.2133333333333335e-06, "loss": 0.0037, "num_tokens": 2332284.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 7817 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.0, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 24.0, "completions/mean_terminated_length": 24.0, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 144.77777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.16111023724079132, "kl": 0.04559926176443696, "learning_rate": 1.2127777777777779e-06, "loss": 0.0023, "num_tokens": 2332624.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7818 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.75, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 4.75, "completions/mean_terminated_length": 4.75, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 144.7962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.021197615191340446, "kl": 0.0009641826036386192, "learning_rate": 1.2122222222222222e-06, "loss": 0.0, "num_tokens": 2332843.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 7819 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 144.8148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.0013280133716762066, "kl": 0.001734871999360621, "learning_rate": 1.2116666666666668e-06, "loss": 0.0001, "num_tokens": 2333123.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7820 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.5, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 18.5, "completions/mean_terminated_length": 18.5, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 144.83333333333334, "frac_reward_zero_std": 1.0, "grad_norm": 0.022694600746035576, "kl": 0.002140758209861815, "learning_rate": 1.2111111111111111e-06, "loss": 0.0001, "num_tokens": 2333437.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7821 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.25, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 15.25, "completions/mean_terminated_length": 15.25, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 144.85185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.15744490921497345, "kl": 0.029042761772871017, "learning_rate": 1.2105555555555557e-06, "loss": 0.0015, "num_tokens": 2333714.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7822 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 144.87037037037038, "frac_reward_zero_std": 1.0, "grad_norm": 0.002712731482461095, "kl": 0.2823324203491211, "learning_rate": 1.21e-06, "loss": 0.0141, "num_tokens": 2334002.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7823 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.5, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 20.5, "completions/mean_terminated_length": 20.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 144.88888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.04221174120903015, "kl": 0.010263033211231232, "learning_rate": 1.2094444444444444e-06, "loss": 0.0005, "num_tokens": 2334332.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7824 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.25, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 12.25, "completions/mean_terminated_length": 12.25, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 144.90740740740742, "frac_reward_zero_std": 1.0, "grad_norm": 0.03874106705188751, "kl": 0.0018677911721169949, "learning_rate": 1.208888888888889e-06, "loss": 0.0001, "num_tokens": 2334593.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7825 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 144.92592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.006597644183784723, "kl": 0.0002221882386947982, "learning_rate": 1.2083333333333333e-06, "loss": 0.0, "num_tokens": 2334849.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 7826 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 7.75, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 7.75, "completions/mean_terminated_length": 7.75, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 144.94444444444446, "frac_reward_zero_std": 1.0, "grad_norm": 0.09237083047628403, "kl": 0.006637983024120331, "learning_rate": 1.207777777777778e-06, "loss": 0.0003, "num_tokens": 2335092.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 7827 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.25, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 4.25, "completions/mean_terminated_length": 4.25, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 144.96296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.03951730951666832, "kl": 0.0015417889226227999, "learning_rate": 1.2072222222222223e-06, "loss": 0.0001, "num_tokens": 2335305.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 7828 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.75, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.75, "completions/mean_terminated_length": 14.75, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 144.9814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.026753660291433334, "kl": 0.029781918972730637, "learning_rate": 1.2066666666666668e-06, "loss": 0.0015, "num_tokens": 2335596.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7829 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 145.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.016778435558080673, "kl": 0.2518605440855026, "learning_rate": 1.2061111111111112e-06, "loss": 0.0126, "num_tokens": 2335894.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7830 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 145.0185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.006109800189733505, "kl": 0.002066858403850347, "learning_rate": 1.2055555555555555e-06, "loss": 0.0001, "num_tokens": 2336178.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7831 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 145.03703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.011453063227236271, "kl": 0.22662628442049026, "learning_rate": 1.2050000000000001e-06, "loss": 0.0113, "num_tokens": 2336480.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7832 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 145.05555555555554, "frac_reward_zero_std": 1.0, "grad_norm": 0.04163024201989174, "kl": 0.06256147101521492, "learning_rate": 1.2044444444444447e-06, "loss": 0.0033, "num_tokens": 2336758.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7833 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.5, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.5, "completions/mean_terminated_length": 6.5, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 145.07407407407408, "frac_reward_zero_std": 0.0, "grad_norm": 7.5931172370910645, "kl": 0.01017645507818088, "learning_rate": 1.203888888888889e-06, "loss": 0.0108, "num_tokens": 2336992.0, "reward": 3.875, "reward_std": 0.25, "rewards/reward_combined/mean": 3.875, "rewards/reward_combined/std": 0.25, "step": 7834 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.0, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 22.0, "completions/mean_terminated_length": 22.0, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 145.09259259259258, "frac_reward_zero_std": 1.0, "grad_norm": 0.021200526505708694, "kl": 0.011437651701271534, "learning_rate": 1.2033333333333334e-06, "loss": 0.0006, "num_tokens": 2337308.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7835 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.75, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 18.75, "completions/mean_terminated_length": 18.75, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 145.11111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.02899792790412903, "kl": 0.009780819527804852, "learning_rate": 1.2027777777777777e-06, "loss": 0.0005, "num_tokens": 2337631.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7836 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 36.5, "completions/clipped_ratio": 0.0, "completions/max_length": 41.0, "completions/max_terminated_length": 41.0, "completions/mean_length": 36.5, "completions/mean_terminated_length": 36.5, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 145.12962962962962, "frac_reward_zero_std": 1.0, "grad_norm": 0.6070953607559204, "kl": 0.20304502546787262, "learning_rate": 1.2022222222222223e-06, "loss": 0.0107, "num_tokens": 2338005.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7837 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 145.14814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.007087125442922115, "kl": 0.0007588863372802734, "learning_rate": 1.2016666666666669e-06, "loss": 0.0, "num_tokens": 2338217.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 7838 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 145.16666666666666, "frac_reward_zero_std": 1.0, "grad_norm": 0.04288534075021744, "kl": 0.02122388780117035, "learning_rate": 1.2011111111111112e-06, "loss": 0.0011, "num_tokens": 2338503.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7839 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 145.1851851851852, "frac_reward_zero_std": 0.0, "grad_norm": 4.003042697906494, "kl": 0.4393618553876877, "learning_rate": 1.2005555555555556e-06, "loss": 0.0568, "num_tokens": 2338789.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 7840 }, { "clip_ratio/high_max": 0.012636349885724485, "clip_ratio/high_mean": 0.012636349885724485, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.012636349885724485, "completion_length": 82.25, "completions/clipped_ratio": 0.25, "completions/max_length": 256.0, "completions/max_terminated_length": 27.0, "completions/mean_length": 82.25, "completions/mean_terminated_length": 24.33333396911621, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 145.2037037037037, "frac_reward_zero_std": 0.0, "grad_norm": 1.468057632446289, "kl": 0.18596934527158737, "learning_rate": 1.2000000000000002e-06, "loss": 0.4084, "num_tokens": 2339338.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 7841 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.75, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 8.75, "completions/mean_terminated_length": 8.75, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 145.22222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.07165522873401642, "kl": 0.004172152490355074, "learning_rate": 1.1994444444444445e-06, "loss": 0.0002, "num_tokens": 2339593.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 7842 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 47.0, "completions/clipped_ratio": 0.0, "completions/max_length": 51.0, "completions/max_terminated_length": 51.0, "completions/mean_length": 47.0, "completions/mean_terminated_length": 47.0, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "epoch": 145.24074074074073, "frac_reward_zero_std": 1.0, "grad_norm": 0.1332596391439438, "kl": 0.05916494503617287, "learning_rate": 1.198888888888889e-06, "loss": 0.0029, "num_tokens": 2340061.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 7843 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 145.25925925925927, "frac_reward_zero_std": 1.0, "grad_norm": 0.04901619255542755, "kl": 0.004188838880509138, "learning_rate": 1.1983333333333334e-06, "loss": 0.0002, "num_tokens": 2340321.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7844 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.5, "completions/clipped_ratio": 0.0, "completions/max_length": 36.0, "completions/max_terminated_length": 36.0, "completions/mean_length": 24.5, "completions/mean_terminated_length": 24.5, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 145.27777777777777, "frac_reward_zero_std": 0.0, "grad_norm": 4.261082172393799, "kl": 0.8374345749616623, "learning_rate": 1.1977777777777778e-06, "loss": -0.0449, "num_tokens": 2340639.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 7845 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.0, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 20.0, "completions/mean_terminated_length": 20.0, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 145.2962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.050543248653411865, "kl": 0.011981490533798933, "learning_rate": 1.1972222222222224e-06, "loss": 0.0006, "num_tokens": 2340967.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 7846 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 145.3148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.00012636746396310627, "kl": 1.0438263416290283e-05, "learning_rate": 1.1966666666666667e-06, "loss": 0.0, "num_tokens": 2341187.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 7847 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 145.33333333333334, "frac_reward_zero_std": 1.0, "grad_norm": 0.23524969816207886, "kl": 0.021140985190868378, "learning_rate": 1.1961111111111113e-06, "loss": 0.0012, "num_tokens": 2341399.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 7848 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.75, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 4.75, "completions/mean_terminated_length": 4.75, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 145.35185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.10657363384962082, "kl": 0.03075014427304268, "learning_rate": 1.1955555555555556e-06, "loss": 0.0016, "num_tokens": 2341618.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 7849 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 145.37037037037038, "frac_reward_zero_std": 1.0, "grad_norm": 0.3038356900215149, "kl": 0.041147696203552186, "learning_rate": 1.195e-06, "loss": 0.0022, "num_tokens": 2341922.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7850 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 145.38888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.006182179786264896, "kl": 0.003202434629201889, "learning_rate": 1.1944444444444446e-06, "loss": 0.0002, "num_tokens": 2342182.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 7851 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.5, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 19.5, "completions/mean_terminated_length": 19.5, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 145.40740740740742, "frac_reward_zero_std": 0.0, "grad_norm": 2.8245656490325928, "kl": 0.2323661968111992, "learning_rate": 1.193888888888889e-06, "loss": 0.0199, "num_tokens": 2342480.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 7852 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 145.42592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.04801587760448456, "kl": 0.01647429633885622, "learning_rate": 1.1933333333333335e-06, "loss": 0.0008, "num_tokens": 2342762.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7853 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 145.44444444444446, "frac_reward_zero_std": 1.0, "grad_norm": 0.014050005935132504, "kl": 0.008487825281918049, "learning_rate": 1.1927777777777778e-06, "loss": 0.0004, "num_tokens": 2343074.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 7854 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 145.46296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.04025721177458763, "kl": 0.00876697339117527, "learning_rate": 1.1922222222222222e-06, "loss": 0.0004, "num_tokens": 2343346.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7855 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.5, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 20.5, "completions/mean_terminated_length": 20.5, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 145.4814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.05755986273288727, "kl": 0.014592362567782402, "learning_rate": 1.1916666666666668e-06, "loss": 0.0007, "num_tokens": 2343680.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7856 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 145.5, "frac_reward_zero_std": 1.0, "grad_norm": 0.0042000119574368, "kl": 0.01031678169965744, "learning_rate": 1.1911111111111111e-06, "loss": 0.0005, "num_tokens": 2343916.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 7857 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.25, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.25, "completions/mean_terminated_length": 15.25, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 145.5185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.009284182451665401, "kl": 0.00156554818386212, "learning_rate": 1.1905555555555557e-06, "loss": 0.0001, "num_tokens": 2344237.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7858 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.75, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 17.75, "completions/mean_terminated_length": 17.75, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 145.53703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.021441319957375526, "kl": 0.16877403110265732, "learning_rate": 1.19e-06, "loss": 0.0084, "num_tokens": 2344548.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7859 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.5, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 12.5, "completions/mean_terminated_length": 12.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 145.55555555555554, "frac_reward_zero_std": 1.0, "grad_norm": 0.031220365315675735, "kl": 0.001708729367237538, "learning_rate": 1.1894444444444446e-06, "loss": 0.0001, "num_tokens": 2344810.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7860 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.25, "completions/clipped_ratio": 0.0, "completions/max_length": 30.0, "completions/max_terminated_length": 30.0, "completions/mean_length": 22.25, "completions/mean_terminated_length": 22.25, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 145.57407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.16209976375102997, "kl": 0.08419189974665642, "learning_rate": 1.188888888888889e-06, "loss": 0.0041, "num_tokens": 2345127.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7861 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.75, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 19.75, "completions/mean_terminated_length": 19.75, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 145.59259259259258, "frac_reward_zero_std": 1.0, "grad_norm": 0.02539394237101078, "kl": 0.006010636920109391, "learning_rate": 1.1883333333333333e-06, "loss": 0.0003, "num_tokens": 2345434.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7862 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 33.0, "completions/clipped_ratio": 0.0, "completions/max_length": 40.0, "completions/max_terminated_length": 40.0, "completions/mean_length": 33.0, "completions/mean_terminated_length": 33.0, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 145.61111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.20160071551799774, "kl": 0.15139350295066833, "learning_rate": 1.1877777777777779e-06, "loss": 0.0075, "num_tokens": 2345790.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7863 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 145.62962962962962, "frac_reward_zero_std": 1.0, "grad_norm": 0.0026979688555002213, "kl": 0.2823800891637802, "learning_rate": 1.1872222222222225e-06, "loss": 0.0141, "num_tokens": 2346078.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7864 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 28.5, "completions/clipped_ratio": 0.0, "completions/max_length": 30.0, "completions/max_terminated_length": 30.0, "completions/mean_length": 28.5, "completions/mean_terminated_length": 28.5, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 145.64814814814815, "frac_reward_zero_std": 0.0, "grad_norm": 1.5137443542480469, "kl": 0.17306193709373474, "learning_rate": 1.1866666666666668e-06, "loss": 0.008, "num_tokens": 2346408.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 7865 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 41.25, "completions/clipped_ratio": 0.0, "completions/max_length": 45.0, "completions/max_terminated_length": 45.0, "completions/mean_length": 41.25, "completions/mean_terminated_length": 41.25, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 145.66666666666666, "frac_reward_zero_std": 1.0, "grad_norm": 0.11269789934158325, "kl": 0.08732687681913376, "learning_rate": 1.1861111111111112e-06, "loss": 0.0043, "num_tokens": 2346841.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7866 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.25, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 17.25, "completions/mean_terminated_length": 17.25, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 145.6851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.026579709723591805, "kl": 0.004248593468219042, "learning_rate": 1.1855555555555555e-06, "loss": 0.0002, "num_tokens": 2347150.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7867 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 31.0, "completions/clipped_ratio": 0.0, "completions/max_length": 40.0, "completions/max_terminated_length": 40.0, "completions/mean_length": 31.0, "completions/mean_terminated_length": 31.0, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 145.7037037037037, "frac_reward_zero_std": 0.0, "grad_norm": 2.7345123291015625, "kl": 0.1574925146996975, "learning_rate": 1.185e-06, "loss": 0.0791, "num_tokens": 2347510.0, "reward": 5.875, "reward_std": 3.25, "rewards/reward_combined/mean": 5.875, "rewards/reward_combined/std": 3.25, "step": 7868 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 9.5, "completions/mean_terminated_length": 9.5, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 145.72222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.02548835054039955, "kl": 0.0018351435428485274, "learning_rate": 1.1844444444444447e-06, "loss": 0.0001, "num_tokens": 2347768.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7869 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 145.74074074074073, "frac_reward_zero_std": 0.0, "grad_norm": 1.3312069177627563, "kl": 0.05814292118884623, "learning_rate": 1.183888888888889e-06, "loss": 0.0024, "num_tokens": 2348056.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 7870 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 145.75925925925927, "frac_reward_zero_std": 1.0, "grad_norm": 0.01709570921957493, "kl": 0.002694465045351535, "learning_rate": 1.1833333333333334e-06, "loss": 0.0001, "num_tokens": 2348354.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7871 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 145.77777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.017442859709262848, "kl": 0.0002487674355506897, "learning_rate": 1.182777777777778e-06, "loss": 0.0, "num_tokens": 2348566.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 7872 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.25, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 22.25, "completions/mean_terminated_length": 22.25, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 145.7962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.07966433465480804, "kl": 0.01321549154818058, "learning_rate": 1.1822222222222223e-06, "loss": 0.0007, "num_tokens": 2348899.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7873 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 33.5, "completions/clipped_ratio": 0.0, "completions/max_length": 46.0, "completions/max_terminated_length": 46.0, "completions/mean_length": 33.5, "completions/mean_terminated_length": 33.5, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 145.8148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.04143744707107544, "kl": 0.056731028482317924, "learning_rate": 1.1816666666666669e-06, "loss": 0.0033, "num_tokens": 2349257.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7874 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 145.83333333333334, "frac_reward_zero_std": 1.0, "grad_norm": 0.014708041213452816, "kl": 0.003817012649960816, "learning_rate": 1.1811111111111112e-06, "loss": 0.0002, "num_tokens": 2349541.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7875 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.25, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 9.25, "completions/mean_terminated_length": 9.25, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 145.85185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.04196811839938164, "kl": 0.003790471935644746, "learning_rate": 1.1805555555555556e-06, "loss": 0.0002, "num_tokens": 2349790.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 7876 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.75, "completions/clipped_ratio": 0.0, "completions/max_length": 45.0, "completions/max_terminated_length": 45.0, "completions/mean_length": 22.75, "completions/mean_terminated_length": 22.75, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 145.87037037037038, "frac_reward_zero_std": 1.0, "grad_norm": 0.22411943972110748, "kl": 0.052198925986886024, "learning_rate": 1.1800000000000001e-06, "loss": 0.0035, "num_tokens": 2350101.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7877 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 42.5, "completions/clipped_ratio": 0.0, "completions/max_length": 65.0, "completions/max_terminated_length": 65.0, "completions/mean_length": 42.5, "completions/mean_terminated_length": 42.5, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 145.88888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.07820450514554977, "kl": 0.03048002067953348, "learning_rate": 1.1794444444444445e-06, "loss": 0.0017, "num_tokens": 2350491.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7878 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.75, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 4.75, "completions/mean_terminated_length": 4.75, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 145.90740740740742, "frac_reward_zero_std": 1.0, "grad_norm": 0.04144682362675667, "kl": 0.002580597996711731, "learning_rate": 1.178888888888889e-06, "loss": 0.0001, "num_tokens": 2350710.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 7879 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 145.92592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.003612089203670621, "kl": 0.00013100206706440076, "learning_rate": 1.1783333333333334e-06, "loss": 0.0, "num_tokens": 2350966.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 7880 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.75, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.75, "completions/mean_terminated_length": 14.75, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 145.94444444444446, "frac_reward_zero_std": 1.0, "grad_norm": 0.04593164473772049, "kl": 0.011681171599775553, "learning_rate": 1.1777777777777778e-06, "loss": 0.0006, "num_tokens": 2351241.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7881 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 34.75, "completions/clipped_ratio": 0.0, "completions/max_length": 35.0, "completions/max_terminated_length": 35.0, "completions/mean_length": 34.75, "completions/mean_terminated_length": 34.75, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 145.96296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.09040899574756622, "kl": 0.09128233417868614, "learning_rate": 1.1772222222222223e-06, "loss": 0.0046, "num_tokens": 2351632.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7882 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.25, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.25, "completions/mean_terminated_length": 15.25, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 145.9814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.028394503518939018, "kl": 0.027181693352758884, "learning_rate": 1.1766666666666667e-06, "loss": 0.0014, "num_tokens": 2351925.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7883 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 146.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.022452551871538162, "kl": 0.001984073081985116, "learning_rate": 1.1761111111111113e-06, "loss": 0.0001, "num_tokens": 2352189.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7884 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.25, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.25, "completions/mean_terminated_length": 13.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 146.0185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.010459593497216702, "kl": 0.0021955041447654366, "learning_rate": 1.1755555555555556e-06, "loss": 0.0001, "num_tokens": 2352466.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7885 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 146.03703703703704, "frac_reward_zero_std": 0.0, "grad_norm": 5.186769962310791, "kl": 0.2285904437303543, "learning_rate": 1.175e-06, "loss": -0.0417, "num_tokens": 2352734.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 7886 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.006329114083200693, "clip_ratio/low_min": 0.006329114083200693, "clip_ratio/region_mean": 0.006329114083200693, "completion_length": 40.75, "completions/clipped_ratio": 0.0, "completions/max_length": 45.0, "completions/max_terminated_length": 45.0, "completions/mean_length": 40.75, "completions/mean_terminated_length": 40.75, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 146.05555555555554, "frac_reward_zero_std": 0.0, "grad_norm": 2.195342540740967, "kl": 0.28074656426906586, "learning_rate": 1.1744444444444445e-06, "loss": 0.0926, "num_tokens": 2353149.0, "reward": 3.25, "reward_std": 5.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 5.5, "step": 7887 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.75, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 20.75, "completions/mean_terminated_length": 20.75, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 146.07407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.03980688005685806, "kl": 0.011534705758094788, "learning_rate": 1.173888888888889e-06, "loss": 0.0006, "num_tokens": 2353484.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7888 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.25, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 13.25, "completions/mean_terminated_length": 13.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 146.09259259259258, "frac_reward_zero_std": 1.0, "grad_norm": 0.013537649996578693, "kl": 0.001071532373316586, "learning_rate": 1.1733333333333335e-06, "loss": 0.0001, "num_tokens": 2353749.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7889 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 35.0, "completions/clipped_ratio": 0.0, "completions/max_length": 36.0, "completions/max_terminated_length": 36.0, "completions/mean_length": 35.0, "completions/mean_terminated_length": 35.0, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 146.11111111111111, "frac_reward_zero_std": 0.0, "grad_norm": 2.2526798248291016, "kl": 0.13479456305503845, "learning_rate": 1.1727777777777778e-06, "loss": 0.0183, "num_tokens": 2354113.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 7890 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.25, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 9.25, "completions/mean_terminated_length": 9.25, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 146.12962962962962, "frac_reward_zero_std": 0.0, "grad_norm": 2.9888226985931396, "kl": 0.220783531665802, "learning_rate": 1.1722222222222224e-06, "loss": 0.0343, "num_tokens": 2354374.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 7891 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.5, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 8.5, "completions/mean_terminated_length": 8.5, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 146.14814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.022459082305431366, "kl": 0.0017074652132578194, "learning_rate": 1.1716666666666667e-06, "loss": 0.0001, "num_tokens": 2354628.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7892 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 146.16666666666666, "frac_reward_zero_std": 1.0, "grad_norm": 0.004629054572433233, "kl": 0.010218597948551178, "learning_rate": 1.171111111111111e-06, "loss": 0.0005, "num_tokens": 2354864.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 7893 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.75, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 18.75, "completions/mean_terminated_length": 18.75, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 146.1851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.06164275109767914, "kl": 0.05812452733516693, "learning_rate": 1.1705555555555557e-06, "loss": 0.0029, "num_tokens": 2355159.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 7894 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 42.0, "completions/clipped_ratio": 0.0, "completions/max_length": 69.0, "completions/max_terminated_length": 69.0, "completions/mean_length": 42.0, "completions/mean_terminated_length": 42.0, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 146.2037037037037, "frac_reward_zero_std": 0.0, "grad_norm": 2.5228805541992188, "kl": 0.0869481973350048, "learning_rate": 1.1700000000000002e-06, "loss": 0.149, "num_tokens": 2355551.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 7895 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.75, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 17.75, "completions/mean_terminated_length": 17.75, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 146.22222222222223, "frac_reward_zero_std": 0.0, "grad_norm": 3.183351993560791, "kl": 0.1702955923974514, "learning_rate": 1.1694444444444446e-06, "loss": 0.0204, "num_tokens": 2355862.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 7896 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.25, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 4.25, "completions/mean_terminated_length": 4.25, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 146.24074074074073, "frac_reward_zero_std": 1.0, "grad_norm": 0.04310184717178345, "kl": 0.0013681321870535612, "learning_rate": 1.168888888888889e-06, "loss": 0.0001, "num_tokens": 2356075.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 7897 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 146.25925925925927, "frac_reward_zero_std": 1.0, "grad_norm": 0.0298955999314785, "kl": 0.002151311608031392, "learning_rate": 1.1683333333333333e-06, "loss": 0.0001, "num_tokens": 2356365.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7898 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.5, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 2.5, "completions/mean_terminated_length": 2.5, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 146.27777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.011317756958305836, "kl": 0.001673273742198944, "learning_rate": 1.1677777777777779e-06, "loss": 0.0001, "num_tokens": 2356571.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 7899 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 38.75, "completions/clipped_ratio": 0.0, "completions/max_length": 53.0, "completions/max_terminated_length": 53.0, "completions/mean_length": 38.75, "completions/mean_terminated_length": 38.75, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 146.2962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.09489888697862625, "kl": 0.03519434109330177, "learning_rate": 1.1672222222222224e-06, "loss": 0.0013, "num_tokens": 2356946.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7900 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.0, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 22.0, "completions/mean_terminated_length": 22.0, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 146.3148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.23598520457744598, "kl": 0.03838397120125592, "learning_rate": 1.1666666666666668e-06, "loss": 0.0021, "num_tokens": 2357278.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7901 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 146.33333333333334, "frac_reward_zero_std": 1.0, "grad_norm": 0.002586456248536706, "kl": 0.0001680016503087245, "learning_rate": 1.1661111111111111e-06, "loss": 0.0, "num_tokens": 2357534.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 7902 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 49.0, "completions/clipped_ratio": 0.0, "completions/max_length": 51.0, "completions/max_terminated_length": 51.0, "completions/mean_length": 49.0, "completions/mean_terminated_length": 49.0, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 146.35185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.0071657029911875725, "kl": 0.041556863114237785, "learning_rate": 1.1655555555555555e-06, "loss": 0.0021, "num_tokens": 2358010.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 7903 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 35.0, "completions/clipped_ratio": 0.0, "completions/max_length": 40.0, "completions/max_terminated_length": 40.0, "completions/mean_length": 35.0, "completions/mean_terminated_length": 35.0, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 146.37037037037038, "frac_reward_zero_std": 1.0, "grad_norm": 0.08107836544513702, "kl": 0.07935908064246178, "learning_rate": 1.165e-06, "loss": 0.004, "num_tokens": 2358378.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7904 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 146.38888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.01385944988578558, "kl": 0.002355664037168026, "learning_rate": 1.1644444444444446e-06, "loss": 0.0001, "num_tokens": 2358676.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7905 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.25, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.25, "completions/mean_terminated_length": 14.25, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 146.40740740740742, "frac_reward_zero_std": 1.0, "grad_norm": 0.040944285690784454, "kl": 0.00592193438205868, "learning_rate": 1.163888888888889e-06, "loss": 0.0003, "num_tokens": 2358965.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7906 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 146.42592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.007776004262268543, "kl": 0.21384289860725403, "learning_rate": 1.1633333333333333e-06, "loss": 0.0107, "num_tokens": 2359269.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7907 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.75, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.75, "completions/mean_terminated_length": 15.75, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 146.44444444444446, "frac_reward_zero_std": 0.0, "grad_norm": 4.493001937866211, "kl": 0.552882194519043, "learning_rate": 1.162777777777778e-06, "loss": 0.0162, "num_tokens": 2359556.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 7908 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 5.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 5.0, "completions/mean_terminated_length": 5.0, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 146.46296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.004648136906325817, "kl": 0.000764566648285836, "learning_rate": 1.1622222222222223e-06, "loss": 0.0, "num_tokens": 2359776.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 7909 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 146.4814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.04474576562643051, "kl": 0.007283863320481032, "learning_rate": 1.1616666666666668e-06, "loss": 0.0004, "num_tokens": 2360036.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 7910 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 146.5, "frac_reward_zero_std": 1.0, "grad_norm": 0.1658739149570465, "kl": 0.07423141412436962, "learning_rate": 1.1611111111111112e-06, "loss": 0.0044, "num_tokens": 2360304.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7911 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.5, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 146.5185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.13218767940998077, "kl": 0.015748276375234127, "learning_rate": 1.1605555555555555e-06, "loss": 0.0009, "num_tokens": 2360582.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7912 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.25, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 19.25, "completions/mean_terminated_length": 19.25, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 146.53703703703704, "frac_reward_zero_std": 0.0, "grad_norm": 2.262145757675171, "kl": 0.24883877485990524, "learning_rate": 1.1600000000000001e-06, "loss": 0.0535, "num_tokens": 2360887.0, "reward": 4.375, "reward_std": 4.75, "rewards/reward_combined/mean": 4.375, "rewards/reward_combined/std": 4.75, "step": 7913 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.25, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 16.25, "completions/mean_terminated_length": 16.25, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 146.55555555555554, "frac_reward_zero_std": 1.0, "grad_norm": 0.6470940709114075, "kl": 0.33305081725120544, "learning_rate": 1.1594444444444445e-06, "loss": 0.0167, "num_tokens": 2361176.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7914 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 42.25, "completions/clipped_ratio": 0.0, "completions/max_length": 45.0, "completions/max_terminated_length": 45.0, "completions/mean_length": 42.25, "completions/mean_terminated_length": 42.25, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 146.57407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.031869783997535706, "kl": 0.07561489194631577, "learning_rate": 1.158888888888889e-06, "loss": 0.0038, "num_tokens": 2361613.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7915 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 146.59259259259258, "frac_reward_zero_std": 1.0, "grad_norm": 0.005396062973886728, "kl": 0.003087639808654785, "learning_rate": 1.1583333333333334e-06, "loss": 0.0002, "num_tokens": 2361873.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 7916 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 146.61111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.0063311196863651276, "kl": 0.00043995678424835205, "learning_rate": 1.1577777777777778e-06, "loss": 0.0, "num_tokens": 2362085.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 7917 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.75, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 18.75, "completions/mean_terminated_length": 18.75, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 146.62962962962962, "frac_reward_zero_std": 1.0, "grad_norm": 0.04132278636097908, "kl": 0.014865984208881855, "learning_rate": 1.1572222222222223e-06, "loss": 0.0007, "num_tokens": 2362380.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7918 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 146.64814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.029131799936294556, "kl": 0.008001567097380757, "learning_rate": 1.1566666666666667e-06, "loss": 0.0004, "num_tokens": 2362654.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7919 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 146.66666666666666, "frac_reward_zero_std": 1.0, "grad_norm": 0.010766127146780491, "kl": 0.001592994318343699, "learning_rate": 1.1561111111111112e-06, "loss": 0.0001, "num_tokens": 2362974.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7920 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 146.6851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.027138231322169304, "kl": 0.034218691289424896, "learning_rate": 1.1555555555555556e-06, "loss": 0.0018, "num_tokens": 2363246.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7921 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 28.5, "completions/clipped_ratio": 0.0, "completions/max_length": 42.0, "completions/max_terminated_length": 42.0, "completions/mean_length": 28.5, "completions/mean_terminated_length": 28.5, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 146.7037037037037, "frac_reward_zero_std": 0.0, "grad_norm": 2.007239580154419, "kl": 0.3660871535539627, "learning_rate": 1.1550000000000002e-06, "loss": 0.14, "num_tokens": 2363580.0, "reward": 2.25, "reward_std": 3.947572946548462, "rewards/reward_combined/mean": 2.25, "rewards/reward_combined/std": 3.947573184967041, "step": 7922 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.5, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 18.5, "completions/mean_terminated_length": 18.5, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 146.72222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.02423933520913124, "kl": 0.006071562645956874, "learning_rate": 1.1544444444444445e-06, "loss": 0.0003, "num_tokens": 2363882.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7923 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.25, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 8.25, "completions/mean_terminated_length": 8.25, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 146.74074074074073, "frac_reward_zero_std": 1.0, "grad_norm": 0.0899619460105896, "kl": 0.025343958288431168, "learning_rate": 1.1538888888888889e-06, "loss": 0.0013, "num_tokens": 2364127.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 7924 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 31.5, "completions/clipped_ratio": 0.0, "completions/max_length": 42.0, "completions/max_terminated_length": 42.0, "completions/mean_length": 31.5, "completions/mean_terminated_length": 31.5, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 146.75925925925927, "frac_reward_zero_std": 0.0, "grad_norm": 1.534064531326294, "kl": 0.0796164982020855, "learning_rate": 1.1533333333333334e-06, "loss": -0.1127, "num_tokens": 2364489.0, "reward": 7.625, "reward_std": 0.25, "rewards/reward_combined/mean": 7.625, "rewards/reward_combined/std": 0.25, "step": 7925 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.0, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 22.0, "completions/mean_terminated_length": 22.0, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 146.77777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.039630137383937836, "kl": 0.00985642010346055, "learning_rate": 1.152777777777778e-06, "loss": 0.0005, "num_tokens": 2364825.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7926 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 146.7962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.012772091664373875, "kl": 0.027771905064582825, "learning_rate": 1.1522222222222224e-06, "loss": 0.0014, "num_tokens": 2365041.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 7927 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.5, "completions/clipped_ratio": 0.0, "completions/max_length": 25.0, "completions/max_terminated_length": 25.0, "completions/mean_length": 21.5, "completions/mean_terminated_length": 21.5, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 146.8148148148148, "frac_reward_zero_std": 0.0, "grad_norm": 2.9195377826690674, "kl": 0.017745744436979294, "learning_rate": 1.1516666666666667e-06, "loss": -0.0005, "num_tokens": 2365375.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 7928 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.5, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 22.5, "completions/mean_terminated_length": 22.5, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 146.83333333333334, "frac_reward_zero_std": 1.0, "grad_norm": 0.024593675509095192, "kl": 0.008817566558718681, "learning_rate": 1.151111111111111e-06, "loss": 0.0004, "num_tokens": 2365693.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7929 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 146.85185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.025032712146639824, "kl": 0.005080802831798792, "learning_rate": 1.1505555555555556e-06, "loss": 0.0003, "num_tokens": 2365977.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7930 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.25, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 17.25, "completions/mean_terminated_length": 17.25, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 146.87037037037038, "frac_reward_zero_std": 1.0, "grad_norm": 0.5371444225311279, "kl": 0.09103479888290167, "learning_rate": 1.1500000000000002e-06, "loss": 0.0051, "num_tokens": 2366286.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7931 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 18.0, "completions/mean_terminated_length": 18.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 146.88888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.10187318176031113, "kl": 0.014471164264250547, "learning_rate": 1.1494444444444446e-06, "loss": 0.0007, "num_tokens": 2366598.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7932 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.25, "completions/clipped_ratio": 0.0, "completions/max_length": 42.0, "completions/max_terminated_length": 42.0, "completions/mean_length": 32.25, "completions/mean_terminated_length": 32.25, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 146.90740740740742, "frac_reward_zero_std": 1.0, "grad_norm": 0.03200627863407135, "kl": 0.11361271142959595, "learning_rate": 1.148888888888889e-06, "loss": 0.0056, "num_tokens": 2366943.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7933 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.25, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 9.25, "completions/mean_terminated_length": 9.25, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 146.92592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.04412641376256943, "kl": 0.005467496928758919, "learning_rate": 1.1483333333333333e-06, "loss": 0.0004, "num_tokens": 2367188.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 7934 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 146.94444444444446, "frac_reward_zero_std": 1.0, "grad_norm": 9.924190089805052e-05, "kl": 7.539987564086914e-06, "learning_rate": 1.1477777777777778e-06, "loss": 0.0, "num_tokens": 2367408.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 7935 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 146.96296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.006239771377295256, "kl": 0.006032479926943779, "learning_rate": 1.1472222222222224e-06, "loss": 0.0003, "num_tokens": 2367720.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 7936 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.25, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 15.25, "completions/mean_terminated_length": 15.25, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 146.9814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.02980108931660652, "kl": 0.026510940864682198, "learning_rate": 1.1466666666666668e-06, "loss": 0.0014, "num_tokens": 2368013.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7937 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 147.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.03869275003671646, "kl": 0.01315173041075468, "learning_rate": 1.1461111111111111e-06, "loss": 0.0007, "num_tokens": 2368289.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7938 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 147.0185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.03547130897641182, "kl": 0.0021692603186238557, "learning_rate": 1.1455555555555557e-06, "loss": 0.0001, "num_tokens": 2368545.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 7939 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.25, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.25, "completions/mean_terminated_length": 14.25, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 147.03703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.026415562257170677, "kl": 0.018437612801790237, "learning_rate": 1.145e-06, "loss": 0.001, "num_tokens": 2368834.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7940 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.75, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 4.75, "completions/mean_terminated_length": 4.75, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 147.05555555555554, "frac_reward_zero_std": 1.0, "grad_norm": 0.07855547964572906, "kl": 0.02937417756766081, "learning_rate": 1.1444444444444446e-06, "loss": 0.0015, "num_tokens": 2369053.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 7941 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 147.07407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.052668847143650055, "kl": 0.008355358499102294, "learning_rate": 1.143888888888889e-06, "loss": 0.0004, "num_tokens": 2369313.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 7942 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 147.09259259259258, "frac_reward_zero_std": 1.0, "grad_norm": 0.03581506758928299, "kl": 0.030561838299036026, "learning_rate": 1.1433333333333333e-06, "loss": 0.0017, "num_tokens": 2369585.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7943 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 16.5, "completions/mean_terminated_length": 16.5, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 147.11111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.05036105960607529, "kl": 0.16230929642915726, "learning_rate": 1.142777777777778e-06, "loss": 0.0082, "num_tokens": 2369891.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7944 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 147.12962962962962, "frac_reward_zero_std": 1.0, "grad_norm": 0.007930236868560314, "kl": 0.0008339583873748779, "learning_rate": 1.1422222222222223e-06, "loss": 0.0, "num_tokens": 2370187.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7945 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.0, "completions/max_length": 35.0, "completions/max_terminated_length": 35.0, "completions/mean_length": 32.0, "completions/mean_terminated_length": 32.0, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 147.14814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.1673945188522339, "kl": 0.16015541553497314, "learning_rate": 1.1416666666666668e-06, "loss": 0.008, "num_tokens": 2370539.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7946 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 84.25, "completions/clipped_ratio": 0.25, "completions/max_length": 256.0, "completions/max_terminated_length": 30.0, "completions/mean_length": 84.25, "completions/mean_terminated_length": 27.0, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 147.16666666666666, "frac_reward_zero_std": 0.0, "grad_norm": 1.5215495824813843, "kl": 0.06804710440337658, "learning_rate": 1.1411111111111112e-06, "loss": 0.406, "num_tokens": 2371112.0, "reward": 5.050000190734863, "reward_std": 4.900000095367432, "rewards/reward_combined/mean": 5.050000190734863, "rewards/reward_combined/std": 4.900000095367432, "step": 7947 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 26.25, "completions/clipped_ratio": 0.0, "completions/max_length": 30.0, "completions/max_terminated_length": 30.0, "completions/mean_length": 26.25, "completions/mean_terminated_length": 26.25, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 147.1851851851852, "frac_reward_zero_std": 0.0, "grad_norm": 5.979363918304443, "kl": 0.12001656368374825, "learning_rate": 1.1405555555555555e-06, "loss": 0.1358, "num_tokens": 2371433.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 7948 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 147.2037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 7.129113510018215e-05, "kl": 6.034970283508301e-06, "learning_rate": 1.14e-06, "loss": 0.0, "num_tokens": 2371653.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 7949 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.5, "completions/clipped_ratio": 0.0, "completions/max_length": 44.0, "completions/max_terminated_length": 44.0, "completions/mean_length": 20.5, "completions/mean_terminated_length": 20.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 147.22222222222223, "frac_reward_zero_std": 0.0, "grad_norm": 5.543222904205322, "kl": 0.040774498134851456, "learning_rate": 1.1394444444444445e-06, "loss": 0.3231, "num_tokens": 2371955.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 7950 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.25, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 15.25, "completions/mean_terminated_length": 15.25, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 147.24074074074073, "frac_reward_zero_std": 1.0, "grad_norm": 0.41722503304481506, "kl": 0.3436295986175537, "learning_rate": 1.138888888888889e-06, "loss": 0.0169, "num_tokens": 2372240.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7951 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.25, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 4.25, "completions/mean_terminated_length": 4.25, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 147.25925925925927, "frac_reward_zero_std": 1.0, "grad_norm": 0.032862938940525055, "kl": 0.000976136332610622, "learning_rate": 1.1383333333333334e-06, "loss": 0.0001, "num_tokens": 2372453.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 7952 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 16.5, "completions/mean_terminated_length": 16.5, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 147.27777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.04064798355102539, "kl": 0.007034502690657973, "learning_rate": 1.137777777777778e-06, "loss": 0.0004, "num_tokens": 2372759.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7953 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 147.2962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.054113153368234634, "kl": 0.012060028035193682, "learning_rate": 1.1372222222222223e-06, "loss": 0.0006, "num_tokens": 2373043.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7954 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 147.3148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.011177632957696915, "kl": 0.003340020775794983, "learning_rate": 1.1366666666666667e-06, "loss": 0.0002, "num_tokens": 2373303.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 7955 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.75, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 18.75, "completions/mean_terminated_length": 18.75, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 147.33333333333334, "frac_reward_zero_std": 1.0, "grad_norm": 0.08062570542097092, "kl": 0.02501923404633999, "learning_rate": 1.1361111111111112e-06, "loss": 0.0013, "num_tokens": 2373626.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7956 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 147.35185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.018188398331403732, "kl": 0.002863895264454186, "learning_rate": 1.1355555555555558e-06, "loss": 0.0001, "num_tokens": 2373904.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7957 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.5, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 4.5, "completions/mean_terminated_length": 4.5, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 147.37037037037038, "frac_reward_zero_std": 1.0, "grad_norm": 1.3936034440994263, "kl": 0.23149637877941132, "learning_rate": 1.1350000000000001e-06, "loss": 0.0138, "num_tokens": 2374142.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 7958 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 147.38888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.006761979777365923, "kl": 0.0014045805437490344, "learning_rate": 1.1344444444444445e-06, "loss": 0.0001, "num_tokens": 2374464.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7959 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 147.40740740740742, "frac_reward_zero_std": 1.0, "grad_norm": 0.007707302458584309, "kl": 0.0012401975691318512, "learning_rate": 1.1338888888888889e-06, "loss": 0.0001, "num_tokens": 2374708.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 7960 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.5, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 2.5, "completions/mean_terminated_length": 2.5, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 147.42592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.027015160769224167, "kl": 0.0018044114112854004, "learning_rate": 1.1333333333333334e-06, "loss": 0.0001, "num_tokens": 2374914.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 7961 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.0, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 17.0, "completions/mean_terminated_length": 17.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 147.44444444444446, "frac_reward_zero_std": 0.0, "grad_norm": 3.3563852310180664, "kl": 0.15176272485405207, "learning_rate": 1.132777777777778e-06, "loss": -0.0047, "num_tokens": 2375198.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 7962 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 147.46296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.03430643677711487, "kl": 0.005433408077806234, "learning_rate": 1.1322222222222223e-06, "loss": 0.0003, "num_tokens": 2375466.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7963 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 147.4814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.03656664490699768, "kl": 0.005640504416078329, "learning_rate": 1.1316666666666667e-06, "loss": 0.0003, "num_tokens": 2375750.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7964 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 50.0, "completions/clipped_ratio": 0.0, "completions/max_length": 55.0, "completions/max_terminated_length": 55.0, "completions/mean_length": 50.0, "completions/mean_terminated_length": 50.0, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 147.5, "frac_reward_zero_std": 1.0, "grad_norm": 0.030927414074540138, "kl": 0.04468701034784317, "learning_rate": 1.131111111111111e-06, "loss": 0.0022, "num_tokens": 2376230.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 7965 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 35.25, "completions/clipped_ratio": 0.0, "completions/max_length": 38.0, "completions/max_terminated_length": 38.0, "completions/mean_length": 35.25, "completions/mean_terminated_length": 35.25, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 147.5185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.12405148893594742, "kl": 0.09022722765803337, "learning_rate": 1.1305555555555556e-06, "loss": 0.0046, "num_tokens": 2376599.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7966 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.5, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 21.5, "completions/mean_terminated_length": 21.5, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 147.53703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.059327106922864914, "kl": 0.0077102226205170155, "learning_rate": 1.1300000000000002e-06, "loss": 0.0004, "num_tokens": 2376929.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7967 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.75, "completions/clipped_ratio": 0.0, "completions/max_length": 37.0, "completions/max_terminated_length": 37.0, "completions/mean_length": 32.75, "completions/mean_terminated_length": 32.75, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 147.55555555555554, "frac_reward_zero_std": 1.0, "grad_norm": 0.037835828959941864, "kl": 0.041687825694680214, "learning_rate": 1.1294444444444445e-06, "loss": 0.0021, "num_tokens": 2377280.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7968 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.75, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 4.75, "completions/mean_terminated_length": 4.75, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 147.57407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.006393719464540482, "kl": 0.00067457853583619, "learning_rate": 1.128888888888889e-06, "loss": 0.0, "num_tokens": 2377499.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 7969 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.75, "completions/clipped_ratio": 0.0, "completions/max_length": 27.0, "completions/max_terminated_length": 27.0, "completions/mean_length": 23.75, "completions/mean_terminated_length": 23.75, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 147.59259259259258, "frac_reward_zero_std": 1.0, "grad_norm": 0.0672559067606926, "kl": 0.015039960853755474, "learning_rate": 1.1283333333333333e-06, "loss": 0.0008, "num_tokens": 2377842.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 7970 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.5, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.5, "completions/mean_terminated_length": 6.5, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 147.61111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.018320487812161446, "kl": 0.0010687048779800534, "learning_rate": 1.1277777777777778e-06, "loss": 0.0001, "num_tokens": 2378076.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 7971 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 147.62962962962962, "frac_reward_zero_std": 1.0, "grad_norm": 0.04312984645366669, "kl": 0.0039011070039123297, "learning_rate": 1.1272222222222224e-06, "loss": 0.0002, "num_tokens": 2378366.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7972 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.25, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.25, "completions/mean_terminated_length": 14.25, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 147.64814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.03429887071251869, "kl": 0.01907608099281788, "learning_rate": 1.1266666666666667e-06, "loss": 0.0009, "num_tokens": 2378655.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7973 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.75, "completions/clipped_ratio": 0.0, "completions/max_length": 42.0, "completions/max_terminated_length": 42.0, "completions/mean_length": 32.75, "completions/mean_terminated_length": 32.75, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 147.66666666666666, "frac_reward_zero_std": 0.0, "grad_norm": 3.948831796646118, "kl": 0.1328934021294117, "learning_rate": 1.126111111111111e-06, "loss": 0.1195, "num_tokens": 2379038.0, "reward": 6.0, "reward_std": 3.34165620803833, "rewards/reward_combined/mean": 6.0, "rewards/reward_combined/std": 3.34165620803833, "step": 7974 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.5, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 17.5, "completions/mean_terminated_length": 17.5, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 147.6851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.2195412963628769, "kl": 0.027799320872873068, "learning_rate": 1.1255555555555557e-06, "loss": 0.0012, "num_tokens": 2379336.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7975 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.25, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 17.25, "completions/mean_terminated_length": 17.25, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 147.7037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.026546813547611237, "kl": 0.0041682415758259594, "learning_rate": 1.125e-06, "loss": 0.0002, "num_tokens": 2379645.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7976 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 147.72222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.009178109467029572, "kl": 0.22648935765028, "learning_rate": 1.1244444444444446e-06, "loss": 0.0113, "num_tokens": 2379947.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7977 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 147.74074074074073, "frac_reward_zero_std": 1.0, "grad_norm": 0.16284054517745972, "kl": 0.4557344913482666, "learning_rate": 1.123888888888889e-06, "loss": 0.0228, "num_tokens": 2380231.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7978 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.5, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 22.5, "completions/mean_terminated_length": 22.5, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 147.75925925925927, "frac_reward_zero_std": 1.0, "grad_norm": 0.1117890477180481, "kl": 0.019367315340787172, "learning_rate": 1.1233333333333333e-06, "loss": 0.001, "num_tokens": 2380549.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7979 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.0, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 21.0, "completions/mean_terminated_length": 21.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 147.77777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.327511727809906, "kl": 0.09328391030430794, "learning_rate": 1.1227777777777779e-06, "loss": 0.0049, "num_tokens": 2380853.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 7980 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.75, "completions/clipped_ratio": 0.0, "completions/max_length": 47.0, "completions/max_terminated_length": 47.0, "completions/mean_length": 23.75, "completions/mean_terminated_length": 23.75, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 147.7962962962963, "frac_reward_zero_std": 0.0, "grad_norm": 2.5302934646606445, "kl": 0.23049628734588623, "learning_rate": 1.1222222222222222e-06, "loss": 0.2571, "num_tokens": 2381168.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 7981 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 9.5, "completions/mean_terminated_length": 9.5, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 147.8148148148148, "frac_reward_zero_std": 0.0, "grad_norm": 3.769524574279785, "kl": 1.2054095305502415, "learning_rate": 1.1216666666666668e-06, "loss": 0.1083, "num_tokens": 2381426.0, "reward": 7.25, "reward_std": 1.5, "rewards/reward_combined/mean": 7.25, "rewards/reward_combined/std": 1.5, "step": 7982 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.5, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 18.5, "completions/mean_terminated_length": 18.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 147.83333333333334, "frac_reward_zero_std": 1.0, "grad_norm": 0.07996983826160431, "kl": 0.027620389126241207, "learning_rate": 1.1211111111111112e-06, "loss": 0.0014, "num_tokens": 2381720.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7983 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 147.85185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.008525537326931953, "kl": 0.00886956648901105, "learning_rate": 1.1205555555555557e-06, "loss": 0.0004, "num_tokens": 2381992.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7984 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.5, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 12.5, "completions/mean_terminated_length": 12.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 147.87037037037038, "frac_reward_zero_std": 1.0, "grad_norm": 0.03856603056192398, "kl": 0.0023510329192504287, "learning_rate": 1.12e-06, "loss": 0.0001, "num_tokens": 2382254.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7985 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 147.88888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.033626701682806015, "kl": 0.010961098596453667, "learning_rate": 1.1194444444444444e-06, "loss": 0.0005, "num_tokens": 2382514.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 7986 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.0, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 19.0, "completions/mean_terminated_length": 19.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 147.90740740740742, "frac_reward_zero_std": 0.0, "grad_norm": 3.152275562286377, "kl": 0.13178277015686035, "learning_rate": 1.118888888888889e-06, "loss": -0.001, "num_tokens": 2382818.0, "reward": 5.75, "reward_std": 2.598076105117798, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 2.598076105117798, "step": 7987 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 36.25, "completions/clipped_ratio": 0.0, "completions/max_length": 52.0, "completions/max_terminated_length": 52.0, "completions/mean_length": 36.25, "completions/mean_terminated_length": 36.25, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 147.92592592592592, "frac_reward_zero_std": 0.0, "grad_norm": 2.5729265213012695, "kl": 0.03442484885454178, "learning_rate": 1.1183333333333336e-06, "loss": 0.144, "num_tokens": 2383187.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 7988 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 147.94444444444446, "frac_reward_zero_std": 1.0, "grad_norm": 0.005631977692246437, "kl": 0.00031691789627075195, "learning_rate": 1.117777777777778e-06, "loss": 0.0, "num_tokens": 2383399.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 7989 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 29.5, "completions/clipped_ratio": 0.0, "completions/max_length": 41.0, "completions/max_terminated_length": 41.0, "completions/mean_length": 29.5, "completions/mean_terminated_length": 29.5, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 147.96296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.12051508575677872, "kl": 0.12766065821051598, "learning_rate": 1.1172222222222223e-06, "loss": 0.0063, "num_tokens": 2383785.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7990 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 27.5, "completions/clipped_ratio": 0.0, "completions/max_length": 37.0, "completions/max_terminated_length": 37.0, "completions/mean_length": 27.5, "completions/mean_terminated_length": 27.5, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 147.9814814814815, "frac_reward_zero_std": 0.0, "grad_norm": 2.7379510402679443, "kl": 0.08575349766761065, "learning_rate": 1.1166666666666666e-06, "loss": 0.0046, "num_tokens": 2384147.0, "reward": 4.5, "reward_std": 4.041451930999756, "rewards/reward_combined/mean": 4.5, "rewards/reward_combined/std": 4.041451930999756, "step": 7991 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 148.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.021751103922724724, "kl": 0.011279094032943249, "learning_rate": 1.1161111111111112e-06, "loss": 0.0006, "num_tokens": 2384459.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 7992 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 148.0185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.01500137709081173, "kl": 0.00031266808218788356, "learning_rate": 1.1155555555555558e-06, "loss": 0.0, "num_tokens": 2384715.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 7993 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.0, "completions/clipped_ratio": 0.0, "completions/max_length": 29.0, "completions/max_terminated_length": 29.0, "completions/mean_length": 24.0, "completions/mean_terminated_length": 24.0, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 148.03703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.12839633226394653, "kl": 0.12877832353115082, "learning_rate": 1.1150000000000001e-06, "loss": 0.0069, "num_tokens": 2385027.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7994 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 148.05555555555554, "frac_reward_zero_std": 1.0, "grad_norm": 0.023710252717137337, "kl": 0.009627685882151127, "learning_rate": 1.1144444444444445e-06, "loss": 0.0005, "num_tokens": 2385299.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7995 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 148.07407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.6606163382530212, "kl": 0.14714330434799194, "learning_rate": 1.1138888888888888e-06, "loss": 0.0078, "num_tokens": 2385573.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7996 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 7.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 7.0, "completions/mean_terminated_length": 7.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 148.09259259259258, "frac_reward_zero_std": 0.0, "grad_norm": 4.340942859649658, "kl": 2.233636226505041, "learning_rate": 1.1133333333333334e-06, "loss": -0.0998, "num_tokens": 2385813.0, "reward": 3.875, "reward_std": 0.25, "rewards/reward_combined/mean": 3.875, "rewards/reward_combined/std": 0.25, "step": 7997 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 148.11111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.010857431218028069, "kl": 0.008574992418289185, "learning_rate": 1.112777777777778e-06, "loss": 0.0004, "num_tokens": 2386049.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 7998 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 148.12962962962962, "frac_reward_zero_std": 1.0, "grad_norm": 0.022151723504066467, "kl": 0.004494261462241411, "learning_rate": 1.1122222222222223e-06, "loss": 0.0002, "num_tokens": 2386333.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 7999 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 148.14814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.02647545374929905, "kl": 0.001660251640714705, "learning_rate": 1.1116666666666667e-06, "loss": 0.0001, "num_tokens": 2386593.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8000 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 31.0, "completions/clipped_ratio": 0.0, "completions/max_length": 43.0, "completions/max_terminated_length": 43.0, "completions/mean_length": 31.0, "completions/mean_terminated_length": 31.0, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 148.16666666666666, "frac_reward_zero_std": 0.0, "grad_norm": 2.225379228591919, "kl": 0.035458141937851906, "learning_rate": 1.111111111111111e-06, "loss": 0.0874, "num_tokens": 2386941.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 8001 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.0, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 22.0, "completions/mean_terminated_length": 22.0, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 148.1851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.016731087118387222, "kl": 0.009441091679036617, "learning_rate": 1.1105555555555556e-06, "loss": 0.0005, "num_tokens": 2387257.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8002 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.25, "completions/clipped_ratio": 0.0, "completions/max_length": 30.0, "completions/max_terminated_length": 30.0, "completions/mean_length": 15.25, "completions/mean_terminated_length": 15.25, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 148.2037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.4517287611961365, "kl": 0.0673406207934022, "learning_rate": 1.1100000000000002e-06, "loss": 0.0038, "num_tokens": 2387538.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8003 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 148.22222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.03300117701292038, "kl": 0.015439941547811031, "learning_rate": 1.1094444444444445e-06, "loss": 0.0008, "num_tokens": 2387806.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8004 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 148.24074074074073, "frac_reward_zero_std": 1.0, "grad_norm": 0.021800559014081955, "kl": 0.011259744875133038, "learning_rate": 1.1088888888888889e-06, "loss": 0.0006, "num_tokens": 2388118.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 8005 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 148.25925925925927, "frac_reward_zero_std": 1.0, "grad_norm": 0.00792778842151165, "kl": 0.0015001397696323693, "learning_rate": 1.1083333333333335e-06, "loss": 0.0001, "num_tokens": 2388440.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8006 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 148.27777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.003114508930593729, "kl": 0.28203630447387695, "learning_rate": 1.1077777777777778e-06, "loss": 0.0141, "num_tokens": 2388728.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8007 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 148.2962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.0360029861330986, "kl": 0.010356476530432701, "learning_rate": 1.1072222222222224e-06, "loss": 0.0005, "num_tokens": 2388988.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8008 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.5, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 18.5, "completions/mean_terminated_length": 18.5, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 148.3148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.04260553792119026, "kl": 0.019408698193728924, "learning_rate": 1.1066666666666667e-06, "loss": 0.001, "num_tokens": 2389282.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8009 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 148.33333333333334, "frac_reward_zero_std": 1.0, "grad_norm": 0.15192146599292755, "kl": 0.4483216553926468, "learning_rate": 1.106111111111111e-06, "loss": 0.0224, "num_tokens": 2389566.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8010 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 148.35185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.03029957413673401, "kl": 0.002653632138390094, "learning_rate": 1.1055555555555557e-06, "loss": 0.0001, "num_tokens": 2389864.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8011 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 29.75, "completions/clipped_ratio": 0.0, "completions/max_length": 35.0, "completions/max_terminated_length": 35.0, "completions/mean_length": 29.75, "completions/mean_terminated_length": 29.75, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 148.37037037037038, "frac_reward_zero_std": 1.0, "grad_norm": 0.14908093214035034, "kl": 0.11877144128084183, "learning_rate": 1.105e-06, "loss": 0.0059, "num_tokens": 2390207.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8012 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 148.38888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.004349487833678722, "kl": 0.00042939186096191406, "learning_rate": 1.1044444444444446e-06, "loss": 0.0, "num_tokens": 2390419.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 8013 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.25, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 19.25, "completions/mean_terminated_length": 19.25, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 148.40740740740742, "frac_reward_zero_std": 1.0, "grad_norm": 0.022892596200108528, "kl": 0.005643333541229367, "learning_rate": 1.103888888888889e-06, "loss": 0.0003, "num_tokens": 2390724.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8014 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 42.75, "completions/clipped_ratio": 0.0, "completions/max_length": 45.0, "completions/max_terminated_length": 45.0, "completions/mean_length": 42.75, "completions/mean_terminated_length": 42.75, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 148.42592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.015953293070197105, "kl": 0.07151110470294952, "learning_rate": 1.1033333333333335e-06, "loss": 0.0036, "num_tokens": 2391163.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8015 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 148.44444444444446, "frac_reward_zero_std": 1.0, "grad_norm": 0.005767535883933306, "kl": 0.0003485828638076782, "learning_rate": 1.1027777777777779e-06, "loss": 0.0, "num_tokens": 2391375.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 8016 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.75, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 4.75, "completions/mean_terminated_length": 4.75, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 148.46296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.03587206453084946, "kl": 0.0286733815446496, "learning_rate": 1.1022222222222222e-06, "loss": 0.0014, "num_tokens": 2391594.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 8017 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 42.5, "completions/clipped_ratio": 0.0, "completions/max_length": 48.0, "completions/max_terminated_length": 48.0, "completions/mean_length": 42.5, "completions/mean_terminated_length": 42.5, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 148.4814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.015005029737949371, "kl": 0.026885409839451313, "learning_rate": 1.1016666666666668e-06, "loss": 0.0012, "num_tokens": 2391984.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8018 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.5, "completions/clipped_ratio": 0.0, "completions/max_length": 27.0, "completions/max_terminated_length": 27.0, "completions/mean_length": 20.5, "completions/mean_terminated_length": 20.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 148.5, "frac_reward_zero_std": 0.0, "grad_norm": 3.3969571590423584, "kl": 0.1655692234635353, "learning_rate": 1.1011111111111113e-06, "loss": 0.0985, "num_tokens": 2392294.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 8019 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.75, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 21.75, "completions/mean_terminated_length": 21.75, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 148.5185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.07614869624376297, "kl": 0.014222525293007493, "learning_rate": 1.1005555555555557e-06, "loss": 0.0007, "num_tokens": 2392625.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8020 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.25, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.25, "completions/mean_terminated_length": 13.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 148.53703703703704, "frac_reward_zero_std": 0.0, "grad_norm": 6.443507671356201, "kl": 0.02599603630369529, "learning_rate": 1.1e-06, "loss": 0.0548, "num_tokens": 2392902.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 8021 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.5, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 17.5, "completions/mean_terminated_length": 17.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 148.55555555555554, "frac_reward_zero_std": 1.0, "grad_norm": 0.01797747239470482, "kl": 0.17019884288311005, "learning_rate": 1.0994444444444444e-06, "loss": 0.0085, "num_tokens": 2393212.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8022 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 148.57407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.033470768481492996, "kl": 0.002352483570575714, "learning_rate": 1.098888888888889e-06, "loss": 0.0001, "num_tokens": 2393420.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 8023 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 148.59259259259258, "frac_reward_zero_std": 1.0, "grad_norm": 0.15900664031505585, "kl": 0.1910335272550583, "learning_rate": 1.0983333333333335e-06, "loss": 0.0096, "num_tokens": 2393704.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8024 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.75, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 20.75, "completions/mean_terminated_length": 20.75, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 148.61111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.05840180814266205, "kl": 0.009562229737639427, "learning_rate": 1.097777777777778e-06, "loss": 0.0005, "num_tokens": 2394035.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 8025 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.75, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 4.75, "completions/mean_terminated_length": 4.75, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 148.62962962962962, "frac_reward_zero_std": 1.0, "grad_norm": 0.04367050528526306, "kl": 0.0027069837087765336, "learning_rate": 1.0972222222222223e-06, "loss": 0.0001, "num_tokens": 2394254.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 8026 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 148.64814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 1.9841219909721985e-05, "kl": 4.693865776062012e-06, "learning_rate": 1.0966666666666666e-06, "loss": 0.0, "num_tokens": 2394474.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 8027 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.25, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 15.25, "completions/mean_terminated_length": 15.25, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 148.66666666666666, "frac_reward_zero_std": 1.0, "grad_norm": 0.13521882891654968, "kl": 0.02333696885034442, "learning_rate": 1.0961111111111112e-06, "loss": 0.0013, "num_tokens": 2394759.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8028 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 56.5, "completions/clipped_ratio": 0.0, "completions/max_length": 73.0, "completions/max_terminated_length": 73.0, "completions/mean_length": 56.5, "completions/mean_terminated_length": 56.5, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "epoch": 148.6851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.055776964873075485, "kl": 0.04010334238409996, "learning_rate": 1.0955555555555557e-06, "loss": 0.002, "num_tokens": 2395265.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 8029 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.5, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 18.5, "completions/mean_terminated_length": 18.5, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 148.7037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.14648155868053436, "kl": 0.07348718494176865, "learning_rate": 1.095e-06, "loss": 0.0036, "num_tokens": 2395559.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 8030 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 148.72222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.026080312207341194, "kl": 0.0021046996116638184, "learning_rate": 1.0944444444444445e-06, "loss": 0.0001, "num_tokens": 2395819.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 8031 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.5, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.5, "completions/mean_terminated_length": 6.5, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 148.74074074074073, "frac_reward_zero_std": 1.0, "grad_norm": 0.008622129447758198, "kl": 0.0009260617371182889, "learning_rate": 1.0938888888888888e-06, "loss": 0.0, "num_tokens": 2396053.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 8032 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.5, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 9.5, "completions/mean_terminated_length": 9.5, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 148.75925925925927, "frac_reward_zero_std": 1.0, "grad_norm": 0.03914367035031319, "kl": 0.002707693027332425, "learning_rate": 1.0933333333333334e-06, "loss": 0.0001, "num_tokens": 2396315.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8033 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 148.77777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.023003587499260902, "kl": 0.01848487727693282, "learning_rate": 1.092777777777778e-06, "loss": 0.0009, "num_tokens": 2396603.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8034 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 148.7962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.03152083605527878, "kl": 0.21905750781297684, "learning_rate": 1.0922222222222223e-06, "loss": 0.011, "num_tokens": 2396907.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8035 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 148.8148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.016932422295212746, "kl": 0.003942613839171827, "learning_rate": 1.0916666666666667e-06, "loss": 0.0002, "num_tokens": 2397207.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8036 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 36.5, "completions/clipped_ratio": 0.0, "completions/max_length": 42.0, "completions/max_terminated_length": 42.0, "completions/mean_length": 36.5, "completions/mean_terminated_length": 36.5, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 148.83333333333334, "frac_reward_zero_std": 1.0, "grad_norm": 0.026091285049915314, "kl": 0.07018796727061272, "learning_rate": 1.0911111111111112e-06, "loss": 0.0035, "num_tokens": 2397581.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8037 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 28.5, "completions/clipped_ratio": 0.0, "completions/max_length": 34.0, "completions/max_terminated_length": 34.0, "completions/mean_length": 28.5, "completions/mean_terminated_length": 28.5, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 148.85185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.27213025093078613, "kl": 0.08176128007471561, "learning_rate": 1.0905555555555556e-06, "loss": 0.0042, "num_tokens": 2397931.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8038 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.75, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 16.75, "completions/mean_terminated_length": 16.75, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 148.87037037037038, "frac_reward_zero_std": 1.0, "grad_norm": 0.08905933052301407, "kl": 0.011676993686705828, "learning_rate": 1.0900000000000002e-06, "loss": 0.0006, "num_tokens": 2398238.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8039 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 29.5, "completions/clipped_ratio": 0.0, "completions/max_length": 43.0, "completions/max_terminated_length": 43.0, "completions/mean_length": 29.5, "completions/mean_terminated_length": 29.5, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 148.88888888888889, "frac_reward_zero_std": 0.0, "grad_norm": 1.6353583335876465, "kl": 0.08034497499465942, "learning_rate": 1.0894444444444445e-06, "loss": -0.1861, "num_tokens": 2398608.0, "reward": 7.875, "reward_std": 0.25, "rewards/reward_combined/mean": 7.875, "rewards/reward_combined/std": 0.25, "step": 8040 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.75, "completions/clipped_ratio": 0.0, "completions/max_length": 25.0, "completions/max_terminated_length": 25.0, "completions/mean_length": 21.75, "completions/mean_terminated_length": 21.75, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 148.90740740740742, "frac_reward_zero_std": 1.0, "grad_norm": 0.06061466038227081, "kl": 0.20739330165088177, "learning_rate": 1.0888888888888889e-06, "loss": 0.0095, "num_tokens": 2398943.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8041 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 148.92592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.02764630690217018, "kl": 0.0016901325434446335, "learning_rate": 1.0883333333333334e-06, "loss": 0.0001, "num_tokens": 2399209.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8042 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 148.94444444444446, "frac_reward_zero_std": 1.0, "grad_norm": 0.012586312368512154, "kl": 0.0036425739526748657, "learning_rate": 1.0877777777777778e-06, "loss": 0.0002, "num_tokens": 2399469.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 8043 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 148.96296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.009735696017742157, "kl": 0.004329459916334599, "learning_rate": 1.0872222222222224e-06, "loss": 0.0002, "num_tokens": 2399757.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8044 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.5, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 20.5, "completions/mean_terminated_length": 20.5, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 148.9814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.050337161868810654, "kl": 0.011873728595674038, "learning_rate": 1.0866666666666667e-06, "loss": 0.0006, "num_tokens": 2400091.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8045 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 149.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.02721092291176319, "kl": 0.012720332480967045, "learning_rate": 1.0861111111111113e-06, "loss": 0.0006, "num_tokens": 2400383.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8046 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 149.0185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.05953369662165642, "kl": 0.010873596649616957, "learning_rate": 1.0855555555555556e-06, "loss": 0.0005, "num_tokens": 2400655.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8047 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 149.03703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.06631327420473099, "kl": 0.03708105534315109, "learning_rate": 1.085e-06, "loss": 0.0019, "num_tokens": 2400927.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8048 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.75, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.75, "completions/mean_terminated_length": 14.75, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 149.05555555555554, "frac_reward_zero_std": 1.0, "grad_norm": 0.030340394005179405, "kl": 0.001940445858053863, "learning_rate": 1.0844444444444446e-06, "loss": 0.0001, "num_tokens": 2401198.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8049 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.25, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 15.25, "completions/mean_terminated_length": 15.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 149.07407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.11697252094745636, "kl": 0.024054283276200294, "learning_rate": 1.0838888888888891e-06, "loss": 0.0012, "num_tokens": 2401483.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8050 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 149.09259259259258, "frac_reward_zero_std": 1.0, "grad_norm": 0.11130638420581818, "kl": 0.010960236191749573, "learning_rate": 1.0833333333333335e-06, "loss": 0.0005, "num_tokens": 2401691.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 8051 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.5, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 16.5, "completions/mean_terminated_length": 16.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 149.11111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.3021848797798157, "kl": 0.0375077078351751, "learning_rate": 1.0827777777777778e-06, "loss": 0.0021, "num_tokens": 2402017.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8052 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 149.12962962962962, "frac_reward_zero_std": 1.0, "grad_norm": 0.008301809430122375, "kl": 0.2389986366033554, "learning_rate": 1.0822222222222222e-06, "loss": 0.0119, "num_tokens": 2402317.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8053 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 149.14814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.007187922019511461, "kl": 0.0002980411081807688, "learning_rate": 1.0816666666666668e-06, "loss": 0.0, "num_tokens": 2402573.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 8054 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 46.75, "completions/clipped_ratio": 0.0, "completions/max_length": 56.0, "completions/max_terminated_length": 56.0, "completions/mean_length": 46.75, "completions/mean_terminated_length": 46.75, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 149.16666666666666, "frac_reward_zero_std": 0.0, "grad_norm": 1.7085152864456177, "kl": 0.05436534434556961, "learning_rate": 1.0811111111111113e-06, "loss": 0.0685, "num_tokens": 2403040.0, "reward": 2.375, "reward_std": 1.25, "rewards/reward_combined/mean": 2.375, "rewards/reward_combined/std": 1.25, "step": 8055 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 7.75, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 7.75, "completions/mean_terminated_length": 7.75, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 149.1851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.04006112366914749, "kl": 0.0027190102264285088, "learning_rate": 1.0805555555555557e-06, "loss": 0.0001, "num_tokens": 2403283.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 8056 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.5, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 19.5, "completions/mean_terminated_length": 19.5, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 149.2037037037037, "frac_reward_zero_std": 0.0, "grad_norm": 4.134734153747559, "kl": 0.09610010124742985, "learning_rate": 1.08e-06, "loss": 0.0265, "num_tokens": 2403581.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 8057 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.5, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 17.5, "completions/mean_terminated_length": 17.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 149.22222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.05130557715892792, "kl": 0.17764735221862793, "learning_rate": 1.0794444444444444e-06, "loss": 0.0089, "num_tokens": 2403891.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8058 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.75, "completions/clipped_ratio": 0.0, "completions/max_length": 35.0, "completions/max_terminated_length": 35.0, "completions/mean_length": 32.75, "completions/mean_terminated_length": 32.75, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 149.24074074074073, "frac_reward_zero_std": 0.0, "grad_norm": 2.64314341545105, "kl": 0.09789001569151878, "learning_rate": 1.078888888888889e-06, "loss": -0.0603, "num_tokens": 2404246.0, "reward": 5.125, "reward_std": 3.4731109142303467, "rewards/reward_combined/mean": 5.125, "rewards/reward_combined/std": 3.4731109142303467, "step": 8059 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.75, "completions/clipped_ratio": 0.0, "completions/max_length": 35.0, "completions/max_terminated_length": 35.0, "completions/mean_length": 20.75, "completions/mean_terminated_length": 20.75, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 149.25925925925927, "frac_reward_zero_std": 0.0, "grad_norm": 5.110212326049805, "kl": 0.16411976516246796, "learning_rate": 1.0783333333333335e-06, "loss": 0.194, "num_tokens": 2404549.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 8060 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 149.27777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.03623785078525543, "kl": 0.005780116654932499, "learning_rate": 1.0777777777777779e-06, "loss": 0.0003, "num_tokens": 2404817.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8061 }, { "clip_ratio/high_max": 0.006329114083200693, "clip_ratio/high_mean": 0.006329114083200693, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.006329114083200693, "completion_length": 33.5, "completions/clipped_ratio": 0.0, "completions/max_length": 42.0, "completions/max_terminated_length": 42.0, "completions/mean_length": 33.5, "completions/mean_terminated_length": 33.5, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 149.2962962962963, "frac_reward_zero_std": 0.0, "grad_norm": 2.317194938659668, "kl": 0.09876434877514839, "learning_rate": 1.0772222222222222e-06, "loss": 0.0046, "num_tokens": 2405175.0, "reward": 5.25, "reward_std": 2.598076105117798, "rewards/reward_combined/mean": 5.25, "rewards/reward_combined/std": 2.598076105117798, "step": 8062 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 149.3148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.010159228928387165, "kl": 0.004583432339131832, "learning_rate": 1.0766666666666666e-06, "loss": 0.0002, "num_tokens": 2405463.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8063 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 36.25, "completions/clipped_ratio": 0.0, "completions/max_length": 49.0, "completions/max_terminated_length": 49.0, "completions/mean_length": 36.25, "completions/mean_terminated_length": 36.25, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 149.33333333333334, "frac_reward_zero_std": 1.0, "grad_norm": 0.09079106897115707, "kl": 0.03502646088600159, "learning_rate": 1.0761111111111112e-06, "loss": 0.0017, "num_tokens": 2405828.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8064 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.5, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 22.5, "completions/mean_terminated_length": 22.5, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 149.35185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.028808090835809708, "kl": 0.008423869498074055, "learning_rate": 1.0755555555555557e-06, "loss": 0.0004, "num_tokens": 2406162.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8065 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.25, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.25, "completions/mean_terminated_length": 2.25, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 149.37037037037038, "frac_reward_zero_std": 0.0, "grad_norm": 19.12066650390625, "kl": 0.08852396160364151, "learning_rate": 1.075e-06, "loss": 0.1143, "num_tokens": 2406375.0, "reward": 2.75, "reward_std": 1.5, "rewards/reward_combined/mean": 2.75, "rewards/reward_combined/std": 1.5, "step": 8066 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.5, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 18.5, "completions/mean_terminated_length": 18.5, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 149.38888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.036222077906131744, "kl": 0.0031972515862435102, "learning_rate": 1.0744444444444444e-06, "loss": 0.0002, "num_tokens": 2406689.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8067 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 149.40740740740742, "frac_reward_zero_std": 1.0, "grad_norm": 0.00947063323110342, "kl": 0.003479044884443283, "learning_rate": 1.073888888888889e-06, "loss": 0.0002, "num_tokens": 2406949.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 8068 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.75, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 22.75, "completions/mean_terminated_length": 22.75, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 149.42592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.04376513510942459, "kl": 0.01266811415553093, "learning_rate": 1.0733333333333334e-06, "loss": 0.0006, "num_tokens": 2407268.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8069 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 149.44444444444446, "frac_reward_zero_std": 1.0, "grad_norm": 0.01830294355750084, "kl": 0.00024837255477905273, "learning_rate": 1.072777777777778e-06, "loss": 0.0, "num_tokens": 2407480.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 8070 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.25, "completions/clipped_ratio": 0.0, "completions/max_length": 25.0, "completions/max_terminated_length": 25.0, "completions/mean_length": 22.25, "completions/mean_terminated_length": 22.25, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 149.46296296296296, "frac_reward_zero_std": 0.0, "grad_norm": 3.397733688354492, "kl": 0.20801132917404175, "learning_rate": 1.0722222222222223e-06, "loss": -0.0036, "num_tokens": 2407797.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 8071 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 149.4814814814815, "frac_reward_zero_std": 0.0, "grad_norm": 0.7678095102310181, "kl": 0.5341393223498017, "learning_rate": 1.0716666666666666e-06, "loss": 0.0277, "num_tokens": 2408079.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 8072 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.5, "completions/clipped_ratio": 0.0, "completions/max_length": 36.0, "completions/max_terminated_length": 36.0, "completions/mean_length": 32.5, "completions/mean_terminated_length": 32.5, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 149.5, "frac_reward_zero_std": 0.0, "grad_norm": 10.365382194519043, "kl": 2.936693798750639, "learning_rate": 1.0711111111111112e-06, "loss": 0.0605, "num_tokens": 2408437.0, "reward": 7.875, "reward_std": 0.25, "rewards/reward_combined/mean": 7.875, "rewards/reward_combined/std": 0.25, "step": 8073 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.25, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 10.25, "completions/mean_terminated_length": 10.25, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 149.5185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.29942160844802856, "kl": 0.041742284782230854, "learning_rate": 1.0705555555555556e-06, "loss": 0.0022, "num_tokens": 2408698.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 8074 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 30.0, "completions/clipped_ratio": 0.0, "completions/max_length": 38.0, "completions/max_terminated_length": 38.0, "completions/mean_length": 30.0, "completions/mean_terminated_length": 30.0, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 149.53703703703704, "frac_reward_zero_std": 0.0, "grad_norm": 2.8690240383148193, "kl": 0.1010272428393364, "learning_rate": 1.0700000000000001e-06, "loss": -0.116, "num_tokens": 2409034.0, "reward": 5.75, "reward_std": 4.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 4.5, "step": 8075 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 149.55555555555554, "frac_reward_zero_std": 1.0, "grad_norm": 0.10850382596254349, "kl": 0.44496341049671173, "learning_rate": 1.0694444444444445e-06, "loss": 0.0222, "num_tokens": 2409318.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8076 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 7.5, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 7.5, "completions/mean_terminated_length": 7.5, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 149.57407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.48185014724731445, "kl": 0.037661001086235046, "learning_rate": 1.068888888888889e-06, "loss": 0.0028, "num_tokens": 2409568.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 8077 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 33.0, "completions/clipped_ratio": 0.0, "completions/max_length": 47.0, "completions/max_terminated_length": 47.0, "completions/mean_length": 33.0, "completions/mean_terminated_length": 33.0, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 149.59259259259258, "frac_reward_zero_std": 0.0, "grad_norm": 1.889609932899475, "kl": 0.15582336485385895, "learning_rate": 1.0683333333333334e-06, "loss": -0.0376, "num_tokens": 2409968.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 8078 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 35.0, "completions/max_terminated_length": 35.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 149.61111111111111, "frac_reward_zero_std": 0.0, "grad_norm": 5.5776047706604, "kl": 0.027710434049367905, "learning_rate": 1.0677777777777778e-06, "loss": 0.3529, "num_tokens": 2410230.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 8079 }, { "clip_ratio/high_max": 0.0055555556900799274, "clip_ratio/high_mean": 0.0055555556900799274, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0055555556900799274, "completion_length": 32.0, "completions/clipped_ratio": 0.0, "completions/max_length": 55.0, "completions/max_terminated_length": 55.0, "completions/mean_length": 32.0, "completions/mean_terminated_length": 32.0, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 149.62962962962962, "frac_reward_zero_std": 0.0, "grad_norm": 1.6658891439437866, "kl": 0.08820047602057457, "learning_rate": 1.0672222222222223e-06, "loss": 0.0024, "num_tokens": 2410610.0, "reward": 7.75, "reward_std": 0.28867512941360474, "rewards/reward_combined/mean": 7.75, "rewards/reward_combined/std": 0.28867512941360474, "step": 8080 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 149.64814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.0033090515062212944, "kl": 0.2819212079048157, "learning_rate": 1.066666666666667e-06, "loss": 0.0141, "num_tokens": 2410898.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8081 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 149.66666666666666, "frac_reward_zero_std": 1.0, "grad_norm": 0.0186783354729414, "kl": 0.004353107186034322, "learning_rate": 1.0661111111111113e-06, "loss": 0.0002, "num_tokens": 2411202.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8082 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.0, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 21.0, "completions/mean_terminated_length": 21.0, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 149.6851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.03170473128557205, "kl": 0.011945093050599098, "learning_rate": 1.0655555555555556e-06, "loss": 0.0006, "num_tokens": 2411538.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8083 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 28.0, "completions/clipped_ratio": 0.0, "completions/max_length": 30.0, "completions/max_terminated_length": 30.0, "completions/mean_length": 28.0, "completions/mean_terminated_length": 28.0, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 149.7037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.6921076774597168, "kl": 0.19604428857564926, "learning_rate": 1.065e-06, "loss": 0.0094, "num_tokens": 2411886.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8084 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.5, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 9.5, "completions/mean_terminated_length": 9.5, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 149.72222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.10207746922969818, "kl": 0.014713948592543602, "learning_rate": 1.0644444444444445e-06, "loss": 0.0007, "num_tokens": 2412148.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8085 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.5, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 6.5, "completions/mean_terminated_length": 6.5, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 149.74074074074073, "frac_reward_zero_std": 1.0, "grad_norm": 0.0939103290438652, "kl": 0.014173194591421634, "learning_rate": 1.063888888888889e-06, "loss": 0.001, "num_tokens": 2412374.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 8086 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.25, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 18.25, "completions/mean_terminated_length": 18.25, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 149.75925925925927, "frac_reward_zero_std": 0.0, "grad_norm": 2.2044599056243896, "kl": 0.007293070666491985, "learning_rate": 1.0633333333333335e-06, "loss": 0.0424, "num_tokens": 2412675.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 8087 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.75, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 18.75, "completions/mean_terminated_length": 18.75, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 149.77777777777777, "frac_reward_zero_std": 0.0, "grad_norm": 2.064008951187134, "kl": 0.37147089652717113, "learning_rate": 1.0627777777777778e-06, "loss": -0.0231, "num_tokens": 2412998.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 8088 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 149.7962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.02894510328769684, "kl": 0.004416832467541099, "learning_rate": 1.0622222222222222e-06, "loss": 0.0002, "num_tokens": 2413282.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8089 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 73.0, "completions/clipped_ratio": 0.25, "completions/max_length": 256.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 73.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 149.8148148148148, "frac_reward_zero_std": 0.0, "grad_norm": 2.174715995788574, "kl": 0.03211288258899003, "learning_rate": 1.0616666666666667e-06, "loss": 0.4495, "num_tokens": 2413794.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 8090 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 149.83333333333334, "frac_reward_zero_std": 1.0, "grad_norm": 0.01773948036134243, "kl": 0.0008611520170234144, "learning_rate": 1.0611111111111113e-06, "loss": 0.0, "num_tokens": 2414086.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8091 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.75, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.75, "completions/mean_terminated_length": 15.75, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 149.85185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.05563555657863617, "kl": 0.027335873804986477, "learning_rate": 1.0605555555555557e-06, "loss": 0.0014, "num_tokens": 2414381.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8092 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 149.87037037037038, "frac_reward_zero_std": 1.0, "grad_norm": 1.9129476640955545e-05, "kl": 4.217028617858887e-06, "learning_rate": 1.06e-06, "loss": 0.0, "num_tokens": 2414601.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 8093 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.5, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 18.5, "completions/mean_terminated_length": 18.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 149.88888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.9015672206878662, "kl": 0.15825163573026657, "learning_rate": 1.0594444444444444e-06, "loss": 0.0078, "num_tokens": 2414895.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 8094 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 149.90740740740742, "frac_reward_zero_std": 1.0, "grad_norm": 0.014812180772423744, "kl": 0.028030164539813995, "learning_rate": 1.058888888888889e-06, "loss": 0.0014, "num_tokens": 2415111.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 8095 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 149.92592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.02588539570569992, "kl": 0.0034061017213389277, "learning_rate": 1.0583333333333335e-06, "loss": 0.0002, "num_tokens": 2415409.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8096 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 149.94444444444446, "frac_reward_zero_std": 1.0, "grad_norm": 0.04678353667259216, "kl": 0.01156777236610651, "learning_rate": 1.0577777777777779e-06, "loss": 0.0006, "num_tokens": 2415679.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8097 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.010638297535479069, "clip_ratio/low_min": 0.010638297535479069, "clip_ratio/region_mean": 0.010638297535479069, "completion_length": 23.5, "completions/clipped_ratio": 0.0, "completions/max_length": 27.0, "completions/max_terminated_length": 27.0, "completions/mean_length": 23.5, "completions/mean_terminated_length": 23.5, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 149.96296296296296, "frac_reward_zero_std": 0.0, "grad_norm": 2.7638955116271973, "kl": 0.07384095899760723, "learning_rate": 1.0572222222222222e-06, "loss": 0.0127, "num_tokens": 2416021.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 8098 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 10.5, "completions/mean_terminated_length": 10.5, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 149.9814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.016534321010112762, "kl": 0.001203491527121514, "learning_rate": 1.0566666666666668e-06, "loss": 0.0001, "num_tokens": 2416283.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8099 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 150.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.021590018644928932, "kl": 0.011054255068302155, "learning_rate": 1.0561111111111111e-06, "loss": 0.0006, "num_tokens": 2416595.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 8100 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.75, "completions/clipped_ratio": 0.0, "completions/max_length": 30.0, "completions/max_terminated_length": 30.0, "completions/mean_length": 22.75, "completions/mean_terminated_length": 22.75, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 150.0185185185185, "frac_reward_zero_std": 0.0, "grad_norm": 3.377063751220703, "kl": 0.10812846943736076, "learning_rate": 1.0555555555555557e-06, "loss": 0.1443, "num_tokens": 2416914.0, "reward": 5.75, "reward_std": 2.598076105117798, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 2.598076105117798, "step": 8101 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 150.03703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.010681210085749626, "kl": 0.00026448070275364444, "learning_rate": 1.055e-06, "loss": 0.0, "num_tokens": 2417170.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 8102 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.25, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.25, "completions/mean_terminated_length": 13.25, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 150.05555555555554, "frac_reward_zero_std": 1.0, "grad_norm": 0.003704081056639552, "kl": 0.0006783672142773867, "learning_rate": 1.0544444444444444e-06, "loss": 0.0, "num_tokens": 2417435.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8103 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 27.25, "completions/clipped_ratio": 0.0, "completions/max_length": 29.0, "completions/max_terminated_length": 29.0, "completions/mean_length": 27.25, "completions/mean_terminated_length": 27.25, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 150.07407407407408, "frac_reward_zero_std": 0.0, "grad_norm": 1.9187875986099243, "kl": 0.11140557378530502, "learning_rate": 1.053888888888889e-06, "loss": 0.0529, "num_tokens": 2417780.0, "reward": 6.5, "reward_std": 2.0, "rewards/reward_combined/mean": 6.5, "rewards/reward_combined/std": 2.0, "step": 8104 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.5, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 16.5, "completions/mean_terminated_length": 16.5, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 150.09259259259258, "frac_reward_zero_std": 0.0, "grad_norm": 2.718015670776367, "kl": 0.06924588605761528, "learning_rate": 1.0533333333333333e-06, "loss": -0.0462, "num_tokens": 2418066.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 8105 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.75, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 8.75, "completions/mean_terminated_length": 8.75, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 150.11111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.15096786618232727, "kl": 0.007373049855232239, "learning_rate": 1.052777777777778e-06, "loss": 0.0004, "num_tokens": 2418313.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 8106 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.75, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.75, "completions/mean_terminated_length": 15.75, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 150.12962962962962, "frac_reward_zero_std": 1.0, "grad_norm": 0.04597393795847893, "kl": 0.005470120697282255, "learning_rate": 1.0522222222222223e-06, "loss": 0.0003, "num_tokens": 2418636.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8107 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.25, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.25, "completions/mean_terminated_length": 14.25, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 150.14814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.042822159826755524, "kl": 0.019783225725404918, "learning_rate": 1.0516666666666668e-06, "loss": 0.001, "num_tokens": 2418925.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8108 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 150.16666666666666, "frac_reward_zero_std": 1.0, "grad_norm": 0.0357607938349247, "kl": 0.0018367469310760498, "learning_rate": 1.0511111111111112e-06, "loss": 0.0001, "num_tokens": 2419137.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 8109 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.75, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 4.75, "completions/mean_terminated_length": 4.75, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 150.1851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.06890654563903809, "kl": 0.003986673022154719, "learning_rate": 1.0505555555555555e-06, "loss": 0.0002, "num_tokens": 2419356.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 8110 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 150.2037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.035229381173849106, "kl": 0.0029021799564361572, "learning_rate": 1.0500000000000001e-06, "loss": 0.0001, "num_tokens": 2419608.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8111 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 150.22222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.01247827336192131, "kl": 0.005485603396664374, "learning_rate": 1.0494444444444447e-06, "loss": 0.0003, "num_tokens": 2419868.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 8112 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.25, "completions/clipped_ratio": 0.0, "completions/max_length": 32.0, "completions/max_terminated_length": 32.0, "completions/mean_length": 20.25, "completions/mean_terminated_length": 20.25, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 150.24074074074073, "frac_reward_zero_std": 0.0, "grad_norm": 3.3714232444763184, "kl": 0.02178830746561289, "learning_rate": 1.048888888888889e-06, "loss": -0.0186, "num_tokens": 2420165.0, "reward": 4.0, "reward_std": 4.690415859222412, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 4.690415859222412, "step": 8113 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.25, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 20.25, "completions/mean_terminated_length": 20.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 150.25925925925927, "frac_reward_zero_std": 1.0, "grad_norm": 0.05673452466726303, "kl": 0.015659386292099953, "learning_rate": 1.0483333333333334e-06, "loss": 0.0008, "num_tokens": 2420498.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8114 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 45.25, "completions/clipped_ratio": 0.0, "completions/max_length": 51.0, "completions/max_terminated_length": 51.0, "completions/mean_length": 45.25, "completions/mean_terminated_length": 45.25, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 150.27777777777777, "frac_reward_zero_std": 0.0, "grad_norm": 1.122441053390503, "kl": 0.05230129137635231, "learning_rate": 1.0477777777777777e-06, "loss": 0.0088, "num_tokens": 2420959.0, "reward": 2.375, "reward_std": 1.25, "rewards/reward_combined/mean": 2.375, "rewards/reward_combined/std": 1.25, "step": 8115 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.25, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 19.25, "completions/mean_terminated_length": 19.25, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 150.2962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.21917429566383362, "kl": 0.027991246432065964, "learning_rate": 1.0472222222222223e-06, "loss": 0.0014, "num_tokens": 2421256.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8116 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.75, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 17.75, "completions/mean_terminated_length": 17.75, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 150.3148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.16448789834976196, "kl": 0.07204729691147804, "learning_rate": 1.0466666666666669e-06, "loss": 0.0036, "num_tokens": 2421547.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 8117 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 31.0, "completions/clipped_ratio": 0.0, "completions/max_length": 34.0, "completions/max_terminated_length": 34.0, "completions/mean_length": 31.0, "completions/mean_terminated_length": 31.0, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 150.33333333333334, "frac_reward_zero_std": 0.0, "grad_norm": 2.1504290103912354, "kl": 0.1534404680132866, "learning_rate": 1.0461111111111112e-06, "loss": 0.0542, "num_tokens": 2421895.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 8118 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 41.25, "completions/clipped_ratio": 0.0, "completions/max_length": 45.0, "completions/max_terminated_length": 45.0, "completions/mean_length": 41.25, "completions/mean_terminated_length": 41.25, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 150.35185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.07963387668132782, "kl": 0.0823584496974945, "learning_rate": 1.0455555555555556e-06, "loss": 0.004, "num_tokens": 2422328.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8119 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 150.37037037037038, "frac_reward_zero_std": 1.0, "grad_norm": 0.027337368577718735, "kl": 0.0052055236883461475, "learning_rate": 1.045e-06, "loss": 0.0003, "num_tokens": 2422628.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8120 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 34.0, "completions/clipped_ratio": 0.0, "completions/max_length": 52.0, "completions/max_terminated_length": 52.0, "completions/mean_length": 34.0, "completions/mean_terminated_length": 34.0, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 150.38888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.21484261751174927, "kl": 0.158009871840477, "learning_rate": 1.0444444444444445e-06, "loss": 0.0077, "num_tokens": 2422980.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8121 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 150.40740740740742, "frac_reward_zero_std": 1.0, "grad_norm": 0.09937156736850739, "kl": 0.4423440098762512, "learning_rate": 1.043888888888889e-06, "loss": 0.0221, "num_tokens": 2423264.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8122 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.75, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 12.75, "completions/mean_terminated_length": 12.75, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 150.42592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.02670072577893734, "kl": 0.053618621081113815, "learning_rate": 1.0433333333333334e-06, "loss": 0.0029, "num_tokens": 2423539.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8123 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 150.44444444444446, "frac_reward_zero_std": 1.0, "grad_norm": 0.003053064923733473, "kl": 0.001869989326223731, "learning_rate": 1.0427777777777778e-06, "loss": 0.0001, "num_tokens": 2423819.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8124 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 150.46296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.02684507519006729, "kl": 0.012549413833767176, "learning_rate": 1.0422222222222221e-06, "loss": 0.0006, "num_tokens": 2424079.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8125 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.75, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.75, "completions/mean_terminated_length": 14.75, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 150.4814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.15604960918426514, "kl": 0.030460447072982788, "learning_rate": 1.0416666666666667e-06, "loss": 0.0015, "num_tokens": 2424370.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8126 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 150.5, "frac_reward_zero_std": 1.0, "grad_norm": 0.05558597296476364, "kl": 0.23321950435638428, "learning_rate": 1.0411111111111113e-06, "loss": 0.0116, "num_tokens": 2424672.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8127 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.75, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 18.75, "completions/mean_terminated_length": 18.75, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 150.5185185185185, "frac_reward_zero_std": 0.0, "grad_norm": 1.5723209381103516, "kl": 0.010954039636999369, "learning_rate": 1.0405555555555556e-06, "loss": 0.0519, "num_tokens": 2424987.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 8128 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.25, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 21.25, "completions/mean_terminated_length": 21.25, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 150.53703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.1781792789697647, "kl": 0.03459032438695431, "learning_rate": 1.04e-06, "loss": 0.0019, "num_tokens": 2425300.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8129 }, { "clip_ratio/high_max": 0.007936508394777775, "clip_ratio/high_mean": 0.007936508394777775, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.007936508394777775, "completion_length": 29.75, "completions/clipped_ratio": 0.0, "completions/max_length": 37.0, "completions/max_terminated_length": 37.0, "completions/mean_length": 29.75, "completions/mean_terminated_length": 29.75, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 150.55555555555554, "frac_reward_zero_std": 0.0, "grad_norm": 2.080113172531128, "kl": 0.3033043257892132, "learning_rate": 1.0394444444444446e-06, "loss": -0.0942, "num_tokens": 2425647.0, "reward": 7.875, "reward_std": 0.25, "rewards/reward_combined/mean": 7.875, "rewards/reward_combined/std": 0.25, "step": 8130 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 150.57407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.009677951224148273, "kl": 0.0035902857780456543, "learning_rate": 1.038888888888889e-06, "loss": 0.0002, "num_tokens": 2425907.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 8131 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 45.5, "completions/clipped_ratio": 0.0, "completions/max_length": 67.0, "completions/max_terminated_length": 67.0, "completions/mean_length": 45.5, "completions/mean_terminated_length": 45.5, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 150.59259259259258, "frac_reward_zero_std": 0.0, "grad_norm": 2.5991265773773193, "kl": 0.05107986927032471, "learning_rate": 1.0383333333333335e-06, "loss": 0.1619, "num_tokens": 2426313.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 8132 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.75, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 13.75, "completions/mean_terminated_length": 13.75, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 150.61111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.21803611516952515, "kl": 0.036678534001111984, "learning_rate": 1.0377777777777778e-06, "loss": 0.002, "num_tokens": 2426592.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8133 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 150.62962962962962, "frac_reward_zero_std": 1.0, "grad_norm": 0.021617362275719643, "kl": 0.004344282206147909, "learning_rate": 1.0372222222222222e-06, "loss": 0.0002, "num_tokens": 2426876.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8134 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.5, "completions/clipped_ratio": 0.0, "completions/max_length": 36.0, "completions/max_terminated_length": 36.0, "completions/mean_length": 24.5, "completions/mean_terminated_length": 24.5, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 150.64814814814815, "frac_reward_zero_std": 0.0, "grad_norm": 3.1845757961273193, "kl": 0.027708614943549037, "learning_rate": 1.0366666666666668e-06, "loss": 0.1252, "num_tokens": 2427218.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 8135 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.25, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 18.25, "completions/mean_terminated_length": 18.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 150.66666666666666, "frac_reward_zero_std": 1.0, "grad_norm": 0.07422053813934326, "kl": 0.14514899253845215, "learning_rate": 1.0361111111111111e-06, "loss": 0.0071, "num_tokens": 2427531.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8136 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.0, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 22.0, "completions/mean_terminated_length": 22.0, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 150.6851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.04654557630419731, "kl": 0.011460009962320328, "learning_rate": 1.0355555555555557e-06, "loss": 0.0006, "num_tokens": 2427847.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8137 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 31.5, "completions/clipped_ratio": 0.0, "completions/max_length": 49.0, "completions/max_terminated_length": 49.0, "completions/mean_length": 31.5, "completions/mean_terminated_length": 31.5, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 150.7037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.045944441109895706, "kl": 0.028395620174705982, "learning_rate": 1.035e-06, "loss": 0.0014, "num_tokens": 2428193.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8138 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.75, "completions/clipped_ratio": 0.0, "completions/max_length": 25.0, "completions/max_terminated_length": 25.0, "completions/mean_length": 22.75, "completions/mean_terminated_length": 22.75, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 150.72222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.06850206106901169, "kl": 0.02160063199698925, "learning_rate": 1.0344444444444446e-06, "loss": 0.0012, "num_tokens": 2428532.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8139 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.0, "completions/clipped_ratio": 0.0, "completions/max_length": 29.0, "completions/max_terminated_length": 29.0, "completions/mean_length": 25.0, "completions/mean_terminated_length": 25.0, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 150.74074074074073, "frac_reward_zero_std": 1.0, "grad_norm": 0.3157109320163727, "kl": 0.054145327769219875, "learning_rate": 1.033888888888889e-06, "loss": 0.0028, "num_tokens": 2428880.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 8140 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.25, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 17.25, "completions/mean_terminated_length": 17.25, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 150.75925925925927, "frac_reward_zero_std": 1.0, "grad_norm": 0.16773797571659088, "kl": 0.17762605845928192, "learning_rate": 1.0333333333333333e-06, "loss": 0.0089, "num_tokens": 2429169.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8141 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 150.77777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.021096736192703247, "kl": 0.0002917870879173279, "learning_rate": 1.0327777777777779e-06, "loss": 0.0, "num_tokens": 2429381.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 8142 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 150.7962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.022632930427789688, "kl": 0.010799581184983253, "learning_rate": 1.0322222222222225e-06, "loss": 0.0005, "num_tokens": 2429693.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 8143 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.75, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.75, "completions/mean_terminated_length": 6.75, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 150.8148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.009848233312368393, "kl": 0.0008404595428146422, "learning_rate": 1.0316666666666668e-06, "loss": 0.0, "num_tokens": 2429928.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 8144 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 150.83333333333334, "frac_reward_zero_std": 1.0, "grad_norm": 0.053247906267642975, "kl": 0.008278374094516039, "learning_rate": 1.0311111111111112e-06, "loss": 0.0004, "num_tokens": 2430200.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8145 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 150.85185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 4.005217124358751e-05, "kl": 4.902482032775879e-06, "learning_rate": 1.0305555555555555e-06, "loss": 0.0, "num_tokens": 2430420.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 8146 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 150.87037037037038, "frac_reward_zero_std": 1.0, "grad_norm": 0.013444446958601475, "kl": 0.0016281711868941784, "learning_rate": 1.03e-06, "loss": 0.0001, "num_tokens": 2430708.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8147 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.75, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 4.75, "completions/mean_terminated_length": 4.75, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 150.88888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.02699807472527027, "kl": 0.027362089604139328, "learning_rate": 1.0294444444444447e-06, "loss": 0.0014, "num_tokens": 2430927.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 8148 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.25, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 13.25, "completions/mean_terminated_length": 13.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 150.90740740740742, "frac_reward_zero_std": 1.0, "grad_norm": 0.07258251309394836, "kl": 0.0339365511899814, "learning_rate": 1.028888888888889e-06, "loss": 0.0021, "num_tokens": 2431232.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8149 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 150.92592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.020388171076774597, "kl": 0.010597873944789171, "learning_rate": 1.0283333333333334e-06, "loss": 0.0005, "num_tokens": 2431504.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8150 }, { "clip_ratio/high_max": 0.007692307699471712, "clip_ratio/high_mean": 0.007692307699471712, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.007692307699471712, "completion_length": 36.75, "completions/clipped_ratio": 0.0, "completions/max_length": 45.0, "completions/max_terminated_length": 45.0, "completions/mean_length": 36.75, "completions/mean_terminated_length": 36.75, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 150.94444444444446, "frac_reward_zero_std": 0.0, "grad_norm": 1.2720868587493896, "kl": 0.09709953889250755, "learning_rate": 1.0277777777777777e-06, "loss": -0.1117, "num_tokens": 2431903.0, "reward": 7.875, "reward_std": 0.25, "rewards/reward_combined/mean": 7.875, "rewards/reward_combined/std": 0.25, "step": 8151 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 150.96296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.0033748243004083633, "kl": 0.2819448858499527, "learning_rate": 1.0272222222222223e-06, "loss": 0.0141, "num_tokens": 2432191.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8152 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 150.9814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.014091315679252148, "kl": 0.007676050066947937, "learning_rate": 1.0266666666666669e-06, "loss": 0.0004, "num_tokens": 2432427.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 8153 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 151.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.19207517802715302, "kl": 0.012144051492214203, "learning_rate": 1.0261111111111112e-06, "loss": 0.0008, "num_tokens": 2432639.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 8154 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 151.0185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.0138387531042099, "kl": 0.027748234570026398, "learning_rate": 1.0255555555555556e-06, "loss": 0.0014, "num_tokens": 2432855.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 8155 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 151.03703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.006152682937681675, "kl": 0.21370196342468262, "learning_rate": 1.025e-06, "loss": 0.0107, "num_tokens": 2433159.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8156 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.5, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 19.5, "completions/mean_terminated_length": 19.5, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 151.05555555555554, "frac_reward_zero_std": 1.0, "grad_norm": 0.022415947169065475, "kl": 0.005774842109531164, "learning_rate": 1.0244444444444445e-06, "loss": 0.0003, "num_tokens": 2433465.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8157 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 37.0, "completions/clipped_ratio": 0.0, "completions/max_length": 45.0, "completions/max_terminated_length": 45.0, "completions/mean_length": 37.0, "completions/mean_terminated_length": 37.0, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 151.07407407407408, "frac_reward_zero_std": 0.0, "grad_norm": 5.014830112457275, "kl": 0.64455546438694, "learning_rate": 1.023888888888889e-06, "loss": 0.1068, "num_tokens": 2433841.0, "reward": 5.125, "reward_std": 3.4731109142303467, "rewards/reward_combined/mean": 5.125, "rewards/reward_combined/std": 3.4731109142303467, "step": 8158 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.25, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 15.25, "completions/mean_terminated_length": 15.25, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 151.09259259259258, "frac_reward_zero_std": 1.0, "grad_norm": 0.023070884868502617, "kl": 0.002350162831135094, "learning_rate": 1.0233333333333334e-06, "loss": 0.0001, "num_tokens": 2434114.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8159 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 151.11111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.026797126978635788, "kl": 0.003947686403989792, "learning_rate": 1.0227777777777778e-06, "loss": 0.0002, "num_tokens": 2434374.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 8160 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 151.12962962962962, "frac_reward_zero_std": 0.0, "grad_norm": 3.254838228225708, "kl": 0.3743899315595627, "learning_rate": 1.0222222222222223e-06, "loss": -0.0138, "num_tokens": 2434656.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 8161 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 151.14814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 1.2282724128453992e-05, "kl": 4.403293132781982e-06, "learning_rate": 1.0216666666666667e-06, "loss": 0.0, "num_tokens": 2434876.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 8162 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 151.16666666666666, "frac_reward_zero_std": 1.0, "grad_norm": 0.025473250076174736, "kl": 0.031202528887661174, "learning_rate": 1.0211111111111113e-06, "loss": 0.0016, "num_tokens": 2435168.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8163 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 18.0, "completions/mean_terminated_length": 18.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 151.1851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.1068192720413208, "kl": 0.02017479855567217, "learning_rate": 1.0205555555555556e-06, "loss": 0.001, "num_tokens": 2435460.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8164 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.0, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 23.0, "completions/mean_terminated_length": 23.0, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 151.2037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.058253102004528046, "kl": 0.011116651818156242, "learning_rate": 1.02e-06, "loss": 0.0006, "num_tokens": 2435780.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8165 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.25, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.25, "completions/mean_terminated_length": 15.25, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 151.22222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.010628771036863327, "kl": 0.0015805106377229095, "learning_rate": 1.0194444444444445e-06, "loss": 0.0001, "num_tokens": 2436101.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8166 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.25, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 13.25, "completions/mean_terminated_length": 13.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 151.24074074074073, "frac_reward_zero_std": 1.0, "grad_norm": 0.07260715216398239, "kl": 0.03462135570589453, "learning_rate": 1.018888888888889e-06, "loss": 0.0021, "num_tokens": 2436406.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8167 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 151.25925925925927, "frac_reward_zero_std": 1.0, "grad_norm": 0.02196148969233036, "kl": 0.002241374895675108, "learning_rate": 1.0183333333333335e-06, "loss": 0.0001, "num_tokens": 2436696.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8168 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 5.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 5.0, "completions/mean_terminated_length": 5.0, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 151.27777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.00387903256341815, "kl": 0.0004249095800332725, "learning_rate": 1.0177777777777778e-06, "loss": 0.0, "num_tokens": 2436916.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 8169 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 151.2962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.05144967511296272, "kl": 0.009612219873815775, "learning_rate": 1.0172222222222224e-06, "loss": 0.0005, "num_tokens": 2437196.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8170 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 45.5, "completions/clipped_ratio": 0.0, "completions/max_length": 52.0, "completions/max_terminated_length": 52.0, "completions/mean_length": 45.5, "completions/mean_terminated_length": 45.5, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 151.3148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.23658540844917297, "kl": 0.06108544394373894, "learning_rate": 1.0166666666666667e-06, "loss": 0.003, "num_tokens": 2437658.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 8171 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 151.33333333333334, "frac_reward_zero_std": 1.0, "grad_norm": 0.002768005710095167, "kl": 0.2822139859199524, "learning_rate": 1.016111111111111e-06, "loss": 0.0141, "num_tokens": 2437946.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8172 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.0, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 19.0, "completions/mean_terminated_length": 19.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 151.35185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.18194431066513062, "kl": 0.11187092214822769, "learning_rate": 1.0155555555555557e-06, "loss": 0.0055, "num_tokens": 2438250.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8173 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 151.37037037037038, "frac_reward_zero_std": 1.0, "grad_norm": 0.06283332407474518, "kl": 0.0033328980207443237, "learning_rate": 1.0150000000000002e-06, "loss": 0.0002, "num_tokens": 2438458.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 8174 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 41.0, "completions/clipped_ratio": 0.0, "completions/max_length": 45.0, "completions/max_terminated_length": 45.0, "completions/mean_length": 41.0, "completions/mean_terminated_length": 41.0, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 151.38888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.05047590658068657, "kl": 0.08335789665579796, "learning_rate": 1.0144444444444446e-06, "loss": 0.0041, "num_tokens": 2438890.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8175 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 151.40740740740742, "frac_reward_zero_std": 1.0, "grad_norm": 0.00035473291063681245, "kl": 0.00010860860493266955, "learning_rate": 1.013888888888889e-06, "loss": 0.0, "num_tokens": 2439146.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 8176 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 151.42592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.06295181065797806, "kl": 0.006658313097432256, "learning_rate": 1.0133333333333333e-06, "loss": 0.0003, "num_tokens": 2439432.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8177 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 151.44444444444446, "frac_reward_zero_std": 1.0, "grad_norm": 0.0133142014965415, "kl": 0.007823064923286438, "learning_rate": 1.0127777777777779e-06, "loss": 0.0004, "num_tokens": 2439668.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 8178 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 151.46296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.02700953744351864, "kl": 0.005251965951174498, "learning_rate": 1.0122222222222224e-06, "loss": 0.0003, "num_tokens": 2439936.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8179 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.25, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 20.25, "completions/mean_terminated_length": 20.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 151.4814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.05628479644656181, "kl": 0.051860153675079346, "learning_rate": 1.0116666666666668e-06, "loss": 0.0028, "num_tokens": 2440265.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8180 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 151.5, "frac_reward_zero_std": 1.0, "grad_norm": 0.1263783574104309, "kl": 0.023624584544450045, "learning_rate": 1.0111111111111111e-06, "loss": 0.0012, "num_tokens": 2440539.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8181 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 151.5185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.007278751581907272, "kl": 0.0007654130458831787, "learning_rate": 1.0105555555555555e-06, "loss": 0.0, "num_tokens": 2440751.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 8182 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.5, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 16.5, "completions/mean_terminated_length": 16.5, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 151.53703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.4509999454021454, "kl": 0.08229318633675575, "learning_rate": 1.01e-06, "loss": 0.0046, "num_tokens": 2441037.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8183 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 151.55555555555554, "frac_reward_zero_std": 1.0, "grad_norm": 0.01150720939040184, "kl": 0.0026005753315985203, "learning_rate": 1.0094444444444446e-06, "loss": 0.0001, "num_tokens": 2441339.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8184 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 18.0, "completions/mean_terminated_length": 18.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 151.57407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.0031096148304641247, "kl": 0.0004133201437070966, "learning_rate": 1.008888888888889e-06, "loss": 0.0, "num_tokens": 2441651.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8185 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.75, "completions/clipped_ratio": 0.0, "completions/max_length": 30.0, "completions/max_terminated_length": 30.0, "completions/mean_length": 25.75, "completions/mean_terminated_length": 25.75, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 151.59259259259258, "frac_reward_zero_std": 0.0, "grad_norm": 2.5590498447418213, "kl": 0.1549786850810051, "learning_rate": 1.0083333333333333e-06, "loss": -0.0047, "num_tokens": 2442002.0, "reward": 2.5, "reward_std": 1.7320507764816284, "rewards/reward_combined/mean": 2.5, "rewards/reward_combined/std": 1.7320507764816284, "step": 8186 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.75, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 19.75, "completions/mean_terminated_length": 19.75, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 151.61111111111111, "frac_reward_zero_std": 0.0, "grad_norm": 4.108607769012451, "kl": 0.15527094062417746, "learning_rate": 1.0077777777777777e-06, "loss": 0.0042, "num_tokens": 2442333.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 8187 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 31.0, "completions/clipped_ratio": 0.0, "completions/max_length": 39.0, "completions/max_terminated_length": 39.0, "completions/mean_length": 31.0, "completions/mean_terminated_length": 31.0, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 151.62962962962962, "frac_reward_zero_std": 0.0, "grad_norm": 2.2809200286865234, "kl": 0.12258255481719971, "learning_rate": 1.0072222222222223e-06, "loss": 0.0817, "num_tokens": 2442673.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 8188 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 34.75, "completions/clipped_ratio": 0.0, "completions/max_length": 46.0, "completions/max_terminated_length": 46.0, "completions/mean_length": 34.75, "completions/mean_terminated_length": 34.75, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 151.64814814814815, "frac_reward_zero_std": 0.0, "grad_norm": 2.6572461128234863, "kl": 0.23176513984799385, "learning_rate": 1.0066666666666668e-06, "loss": 0.1487, "num_tokens": 2443048.0, "reward": 4.125, "reward_std": 3.902456521987915, "rewards/reward_combined/mean": 4.125, "rewards/reward_combined/std": 3.902456521987915, "step": 8189 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.75, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.75, "completions/mean_terminated_length": 6.75, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 151.66666666666666, "frac_reward_zero_std": 1.0, "grad_norm": 0.01133536733686924, "kl": 0.0007886922103352845, "learning_rate": 1.0061111111111112e-06, "loss": 0.0, "num_tokens": 2443283.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 8190 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 151.6851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 1.8376305103302002, "kl": 0.19902540370821953, "learning_rate": 1.0055555555555556e-06, "loss": 0.0119, "num_tokens": 2443561.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8191 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.25, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 9.25, "completions/mean_terminated_length": 9.25, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 151.7037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.03542771935462952, "kl": 0.012145561631768942, "learning_rate": 1.0050000000000001e-06, "loss": 0.0006, "num_tokens": 2443822.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8192 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.5, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 8.5, "completions/mean_terminated_length": 8.5, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 151.72222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.0464513786137104, "kl": 0.009842179249972105, "learning_rate": 1.0044444444444445e-06, "loss": 0.0006, "num_tokens": 2444076.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 8193 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 151.74074074074073, "frac_reward_zero_std": 1.0, "grad_norm": 0.026396965608000755, "kl": 0.0022903718054294586, "learning_rate": 1.003888888888889e-06, "loss": 0.0001, "num_tokens": 2444320.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 8194 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 151.75925925925927, "frac_reward_zero_std": 1.0, "grad_norm": 0.02452867291867733, "kl": 0.004026540671475232, "learning_rate": 1.0033333333333334e-06, "loss": 0.0002, "num_tokens": 2444602.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8195 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 151.77777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.023364445194602013, "kl": 0.022785136476159096, "learning_rate": 1.0027777777777778e-06, "loss": 0.0012, "num_tokens": 2444894.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8196 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.25, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.25, "completions/mean_terminated_length": 11.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 151.7962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.014708261936903, "kl": 0.00916608702391386, "learning_rate": 1.0022222222222223e-06, "loss": 0.0005, "num_tokens": 2445167.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8197 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 28.25, "completions/clipped_ratio": 0.0, "completions/max_length": 37.0, "completions/max_terminated_length": 37.0, "completions/mean_length": 28.25, "completions/mean_terminated_length": 28.25, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 151.8148148148148, "frac_reward_zero_std": 0.0, "grad_norm": 3.6643409729003906, "kl": 0.1366891786456108, "learning_rate": 1.0016666666666667e-06, "loss": 0.1352, "num_tokens": 2445504.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 8198 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.75, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 17.75, "completions/mean_terminated_length": 17.75, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 151.83333333333334, "frac_reward_zero_std": 1.0, "grad_norm": 0.038141828030347824, "kl": 0.17027391493320465, "learning_rate": 1.0011111111111112e-06, "loss": 0.0085, "num_tokens": 2445815.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8199 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.25, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 21.25, "completions/mean_terminated_length": 21.25, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 151.85185185185185, "frac_reward_zero_std": 0.0, "grad_norm": 5.622564792633057, "kl": 0.05021194228902459, "learning_rate": 1.0005555555555556e-06, "loss": 0.0109, "num_tokens": 2446144.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 8200 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 151.87037037037038, "frac_reward_zero_std": 1.0, "grad_norm": 0.022979052737355232, "kl": 0.010816387832164764, "learning_rate": 1.0000000000000002e-06, "loss": 0.0005, "num_tokens": 2446456.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 8201 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 151.88888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.008242185227572918, "kl": 0.0008976086683105677, "learning_rate": 9.994444444444445e-07, "loss": 0.0, "num_tokens": 2446716.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8202 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 33.0, "completions/clipped_ratio": 0.0, "completions/max_length": 39.0, "completions/max_terminated_length": 39.0, "completions/mean_length": 33.0, "completions/mean_terminated_length": 33.0, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 151.90740740740742, "frac_reward_zero_std": 0.0, "grad_norm": 1.2587367296218872, "kl": 0.11567394249141216, "learning_rate": 9.988888888888889e-07, "loss": -0.1653, "num_tokens": 2447100.0, "reward": 7.875, "reward_std": 0.25, "rewards/reward_combined/mean": 7.875, "rewards/reward_combined/std": 0.25, "step": 8203 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.25, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 18.25, "completions/mean_terminated_length": 18.25, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 151.92592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.11154872179031372, "kl": 0.05156371742486954, "learning_rate": 9.983333333333334e-07, "loss": 0.0026, "num_tokens": 2447393.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 8204 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 36.5, "completions/clipped_ratio": 0.0, "completions/max_length": 51.0, "completions/max_terminated_length": 51.0, "completions/mean_length": 36.5, "completions/mean_terminated_length": 36.5, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 151.94444444444446, "frac_reward_zero_std": 1.0, "grad_norm": 0.033982016146183014, "kl": 0.03017153311520815, "learning_rate": 9.97777777777778e-07, "loss": 0.0015, "num_tokens": 2447759.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8205 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 36.5, "completions/clipped_ratio": 0.0, "completions/max_length": 46.0, "completions/max_terminated_length": 46.0, "completions/mean_length": 36.5, "completions/mean_terminated_length": 36.5, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 151.96296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.02835831418633461, "kl": 0.06847169809043407, "learning_rate": 9.972222222222224e-07, "loss": 0.0032, "num_tokens": 2448129.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8206 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.5, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 6.5, "completions/mean_terminated_length": 6.5, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 151.9814814814815, "frac_reward_zero_std": 0.0, "grad_norm": 10.486072540283203, "kl": 0.011796952225267887, "learning_rate": 9.966666666666667e-07, "loss": 0.2977, "num_tokens": 2448351.0, "reward": 2.75, "reward_std": 1.5, "rewards/reward_combined/mean": 2.75, "rewards/reward_combined/std": 1.5, "step": 8207 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.0, "completions/clipped_ratio": 0.0, "completions/max_length": 33.0, "completions/max_terminated_length": 33.0, "completions/mean_length": 21.0, "completions/mean_terminated_length": 21.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 152.0, "frac_reward_zero_std": 0.0, "grad_norm": 3.234218120574951, "kl": 0.3678881525993347, "learning_rate": 9.96111111111111e-07, "loss": 0.0137, "num_tokens": 2448655.0, "reward": 4.5, "reward_std": 4.041451930999756, "rewards/reward_combined/mean": 4.5, "rewards/reward_combined/std": 4.041451930999756, "step": 8208 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 152.0185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.0067313858307898045, "kl": 0.00588001124560833, "learning_rate": 9.955555555555556e-07, "loss": 0.0003, "num_tokens": 2448967.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 8209 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.5, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 8.5, "completions/mean_terminated_length": 8.5, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 152.03703703703704, "frac_reward_zero_std": 0.0, "grad_norm": 4.338589668273926, "kl": 1.2888412401080132, "learning_rate": 9.950000000000002e-07, "loss": 0.1226, "num_tokens": 2449229.0, "reward": 2.5, "reward_std": 3.0, "rewards/reward_combined/mean": 2.5, "rewards/reward_combined/std": 3.0, "step": 8210 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.75, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.75, "completions/mean_terminated_length": 14.75, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 152.05555555555554, "frac_reward_zero_std": 1.0, "grad_norm": 0.007764013484120369, "kl": 0.0014606418553739786, "learning_rate": 9.944444444444446e-07, "loss": 0.0001, "num_tokens": 2449548.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8211 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 152.07407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.23476353287696838, "kl": 0.038024845998734236, "learning_rate": 9.93888888888889e-07, "loss": 0.0019, "num_tokens": 2449836.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8212 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 152.09259259259258, "frac_reward_zero_std": 1.0, "grad_norm": 0.013054301030933857, "kl": 0.007953070104122162, "learning_rate": 9.933333333333333e-07, "loss": 0.0004, "num_tokens": 2450072.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 8213 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.25, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 19.25, "completions/mean_terminated_length": 19.25, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 152.11111111111111, "frac_reward_zero_std": 0.0, "grad_norm": 2.8816001415252686, "kl": 0.2630881192162633, "learning_rate": 9.927777777777778e-07, "loss": 0.0143, "num_tokens": 2450377.0, "reward": 5.75, "reward_std": 4.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 4.5, "step": 8214 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 152.12962962962962, "frac_reward_zero_std": 1.0, "grad_norm": 0.002605794696137309, "kl": 0.28221240639686584, "learning_rate": 9.922222222222224e-07, "loss": 0.0141, "num_tokens": 2450665.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8215 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 35.0, "completions/clipped_ratio": 0.0, "completions/max_length": 39.0, "completions/max_terminated_length": 39.0, "completions/mean_length": 35.0, "completions/mean_terminated_length": 35.0, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 152.14814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.21193814277648926, "kl": 0.11948662996292114, "learning_rate": 9.916666666666668e-07, "loss": 0.0061, "num_tokens": 2451033.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8216 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 152.16666666666666, "frac_reward_zero_std": 1.0, "grad_norm": 0.03178351745009422, "kl": 0.02848426764830947, "learning_rate": 9.911111111111111e-07, "loss": 0.0016, "num_tokens": 2451305.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8217 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 26.75, "completions/clipped_ratio": 0.0, "completions/max_length": 32.0, "completions/max_terminated_length": 32.0, "completions/mean_length": 26.75, "completions/mean_terminated_length": 26.75, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 152.1851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.08177682012319565, "kl": 0.03714816831052303, "learning_rate": 9.905555555555555e-07, "loss": 0.002, "num_tokens": 2451660.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 8218 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.5, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 18.5, "completions/mean_terminated_length": 18.5, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 152.2037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.0009436577092856169, "kl": 0.0005122131842654198, "learning_rate": 9.9e-07, "loss": 0.0, "num_tokens": 2451974.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8219 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 152.22222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.03137631341814995, "kl": 0.4379049837589264, "learning_rate": 9.894444444444446e-07, "loss": 0.0219, "num_tokens": 2452258.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8220 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 152.24074074074073, "frac_reward_zero_std": 1.0, "grad_norm": 0.014488914050161839, "kl": 0.02741071581840515, "learning_rate": 9.88888888888889e-07, "loss": 0.0014, "num_tokens": 2452474.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 8221 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.25, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 19.25, "completions/mean_terminated_length": 19.25, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 152.25925925925927, "frac_reward_zero_std": 1.0, "grad_norm": 0.22229328751564026, "kl": 0.037908364087343216, "learning_rate": 9.883333333333333e-07, "loss": 0.0019, "num_tokens": 2452771.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8222 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 152.27777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.32105928659439087, "kl": 0.04324895440367982, "learning_rate": 9.87777777777778e-07, "loss": 0.002, "num_tokens": 2453039.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8223 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 27.5, "completions/clipped_ratio": 0.0, "completions/max_length": 34.0, "completions/max_terminated_length": 34.0, "completions/mean_length": 27.5, "completions/mean_terminated_length": 27.5, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 152.2962962962963, "frac_reward_zero_std": 0.0, "grad_norm": 2.932542324066162, "kl": 0.19260598719120026, "learning_rate": 9.872222222222223e-07, "loss": -0.1226, "num_tokens": 2453365.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 8224 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 31.5, "completions/clipped_ratio": 0.0, "completions/max_length": 46.0, "completions/max_terminated_length": 46.0, "completions/mean_length": 31.5, "completions/mean_terminated_length": 31.5, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 152.3148148148148, "frac_reward_zero_std": 0.0, "grad_norm": 3.8480288982391357, "kl": 0.15899479761719704, "learning_rate": 9.866666666666668e-07, "loss": 0.1479, "num_tokens": 2453727.0, "reward": 5.25, "reward_std": 4.5, "rewards/reward_combined/mean": 5.25, "rewards/reward_combined/std": 4.5, "step": 8225 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 152.33333333333334, "frac_reward_zero_std": 1.0, "grad_norm": 0.02855774015188217, "kl": 0.01248637493699789, "learning_rate": 9.861111111111112e-07, "loss": 0.0006, "num_tokens": 2453987.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8226 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 152.35185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.021753400564193726, "kl": 0.0028998898342251778, "learning_rate": 9.855555555555555e-07, "loss": 0.0001, "num_tokens": 2454289.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8227 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.25, "completions/clipped_ratio": 0.0, "completions/max_length": 41.0, "completions/max_terminated_length": 41.0, "completions/mean_length": 22.25, "completions/mean_terminated_length": 22.25, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 152.37037037037038, "frac_reward_zero_std": 1.0, "grad_norm": 0.42692801356315613, "kl": 0.17295902222394943, "learning_rate": 9.85e-07, "loss": 0.0085, "num_tokens": 2454598.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8228 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 17.0, "completions/mean_terminated_length": 17.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 152.38888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.006772028747946024, "kl": 0.17442352324724197, "learning_rate": 9.844444444444445e-07, "loss": 0.0087, "num_tokens": 2454906.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8229 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 152.40740740740742, "frac_reward_zero_std": 1.0, "grad_norm": 0.04253422096371651, "kl": 0.0010010600090026855, "learning_rate": 9.83888888888889e-07, "loss": 0.0001, "num_tokens": 2455110.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 8230 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 152.42592592592592, "frac_reward_zero_std": 0.0, "grad_norm": 3.604266405105591, "kl": 0.015419248258695006, "learning_rate": 9.833333333333334e-07, "loss": 0.1185, "num_tokens": 2455386.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 8231 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 7.75, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 7.75, "completions/mean_terminated_length": 7.75, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 152.44444444444446, "frac_reward_zero_std": 1.0, "grad_norm": 0.04756354168057442, "kl": 0.002690201858058572, "learning_rate": 9.82777777777778e-07, "loss": 0.0001, "num_tokens": 2455629.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 8232 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 152.46296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.008512798696756363, "kl": 0.00021697879128623754, "learning_rate": 9.822222222222223e-07, "loss": 0.0, "num_tokens": 2455885.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 8233 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.25, "completions/clipped_ratio": 0.0, "completions/max_length": 25.0, "completions/max_terminated_length": 25.0, "completions/mean_length": 20.25, "completions/mean_terminated_length": 20.25, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 152.4814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.11054234951734543, "kl": 0.0506095290184021, "learning_rate": 9.816666666666667e-07, "loss": 0.0026, "num_tokens": 2456186.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 8234 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 152.5, "frac_reward_zero_std": 1.0, "grad_norm": 0.03893882781267166, "kl": 0.011487807147204876, "learning_rate": 9.811111111111112e-07, "loss": 0.0006, "num_tokens": 2456462.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8235 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 152.5185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.021771378815174103, "kl": 0.0004122108221054077, "learning_rate": 9.805555555555558e-07, "loss": 0.0, "num_tokens": 2456674.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 8236 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 90.0, "completions/clipped_ratio": 0.25, "completions/max_length": 256.0, "completions/max_terminated_length": 54.0, "completions/mean_length": 90.0, "completions/mean_terminated_length": 34.66666793823242, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 152.53703703703704, "frac_reward_zero_std": 0.0, "grad_norm": 1.9548532962799072, "kl": 0.11021560057997704, "learning_rate": 9.800000000000001e-07, "loss": 0.5337, "num_tokens": 2457258.0, "reward": 4.0, "reward_std": 4.041451930999756, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 4.041451930999756, "step": 8237 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 31.5, "completions/clipped_ratio": 0.0, "completions/max_length": 49.0, "completions/max_terminated_length": 49.0, "completions/mean_length": 31.5, "completions/mean_terminated_length": 31.5, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 152.55555555555554, "frac_reward_zero_std": 1.0, "grad_norm": 0.03682565689086914, "kl": 0.026065649464726448, "learning_rate": 9.794444444444445e-07, "loss": 0.0013, "num_tokens": 2457604.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8238 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.75, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 20.75, "completions/mean_terminated_length": 20.75, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 152.57407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.04172550141811371, "kl": 0.011209496296942234, "learning_rate": 9.788888888888889e-07, "loss": 0.0006, "num_tokens": 2457935.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8239 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.25, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.25, "completions/mean_terminated_length": 11.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 152.59259259259258, "frac_reward_zero_std": 1.0, "grad_norm": 0.04400644451379776, "kl": 0.009475438855588436, "learning_rate": 9.783333333333334e-07, "loss": 0.0005, "num_tokens": 2458208.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8240 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.75, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 13.75, "completions/mean_terminated_length": 13.75, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 152.61111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.023080218583345413, "kl": 0.0037068736273795366, "learning_rate": 9.77777777777778e-07, "loss": 0.0002, "num_tokens": 2458487.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8241 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 152.62962962962962, "frac_reward_zero_std": 1.0, "grad_norm": 4.4556396460393444e-05, "kl": 4.597008228302002e-06, "learning_rate": 9.772222222222223e-07, "loss": 0.0, "num_tokens": 2458707.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 8242 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.5, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 23.5, "completions/mean_terminated_length": 23.5, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 152.64814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.29216164350509644, "kl": 0.07593680592253804, "learning_rate": 9.766666666666667e-07, "loss": 0.0037, "num_tokens": 2459029.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8243 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 152.66666666666666, "frac_reward_zero_std": 1.0, "grad_norm": 0.014537781476974487, "kl": 0.0011678061564452946, "learning_rate": 9.76111111111111e-07, "loss": 0.0001, "num_tokens": 2459297.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8244 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.25, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 4.25, "completions/mean_terminated_length": 4.25, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 152.6851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.11721856147050858, "kl": 0.005795794713776559, "learning_rate": 9.755555555555556e-07, "loss": 0.0003, "num_tokens": 2459514.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 8245 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 152.7037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.007133961655199528, "kl": 0.0008070170879364014, "learning_rate": 9.750000000000002e-07, "loss": 0.0, "num_tokens": 2459726.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 8246 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 42.5, "completions/clipped_ratio": 0.0, "completions/max_length": 44.0, "completions/max_terminated_length": 44.0, "completions/mean_length": 42.5, "completions/mean_terminated_length": 42.5, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "epoch": 152.72222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.2815612256526947, "kl": 0.10405945032835007, "learning_rate": 9.744444444444445e-07, "loss": 0.0052, "num_tokens": 2460164.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8247 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 152.74074074074073, "frac_reward_zero_std": 1.0, "grad_norm": 0.028884461149573326, "kl": 0.0032173432409763336, "learning_rate": 9.73888888888889e-07, "loss": 0.0002, "num_tokens": 2460416.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8248 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 47.0, "completions/clipped_ratio": 0.0, "completions/max_length": 51.0, "completions/max_terminated_length": 51.0, "completions/mean_length": 47.0, "completions/mean_terminated_length": 47.0, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 152.75925925925927, "frac_reward_zero_std": 1.0, "grad_norm": 0.030192283913493156, "kl": 0.04407590627670288, "learning_rate": 9.733333333333333e-07, "loss": 0.0022, "num_tokens": 2460884.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 8249 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.75, "completions/clipped_ratio": 0.0, "completions/max_length": 31.0, "completions/max_terminated_length": 31.0, "completions/mean_length": 24.75, "completions/mean_terminated_length": 24.75, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 152.77777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.06040835380554199, "kl": 0.053555406630039215, "learning_rate": 9.727777777777778e-07, "loss": 0.0023, "num_tokens": 2461211.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8250 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.5, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 20.5, "completions/mean_terminated_length": 20.5, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 152.7962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.03256450593471527, "kl": 0.010617740452289581, "learning_rate": 9.722222222222224e-07, "loss": 0.0005, "num_tokens": 2461545.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8251 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 7.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 7.0, "completions/mean_terminated_length": 7.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 152.8148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.05199417099356651, "kl": 0.002025383757427335, "learning_rate": 9.716666666666668e-07, "loss": 0.0001, "num_tokens": 2461793.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 8252 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 152.83333333333334, "frac_reward_zero_std": 1.0, "grad_norm": 0.06755519658327103, "kl": 0.016852103173732758, "learning_rate": 9.711111111111111e-07, "loss": 0.0009, "num_tokens": 2462063.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8253 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 29.75, "completions/clipped_ratio": 0.0, "completions/max_length": 37.0, "completions/max_terminated_length": 37.0, "completions/mean_length": 29.75, "completions/mean_terminated_length": 29.75, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 152.85185185185185, "frac_reward_zero_std": 0.0, "grad_norm": 2.6125407218933105, "kl": 0.12238950654864311, "learning_rate": 9.705555555555557e-07, "loss": 0.0869, "num_tokens": 2462406.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 8254 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 152.87037037037038, "frac_reward_zero_std": 1.0, "grad_norm": 0.6290031671524048, "kl": 0.07698490470647812, "learning_rate": 9.7e-07, "loss": 0.0042, "num_tokens": 2462704.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8255 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.0, "completions/clipped_ratio": 0.0, "completions/max_length": 42.0, "completions/max_terminated_length": 42.0, "completions/mean_length": 21.0, "completions/mean_terminated_length": 21.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 152.88888888888889, "frac_reward_zero_std": 0.0, "grad_norm": 3.0255959033966064, "kl": 0.11801899410784245, "learning_rate": 9.694444444444446e-07, "loss": 0.2686, "num_tokens": 2463020.0, "reward": 7.300000190734863, "reward_std": 0.40000009536743164, "rewards/reward_combined/mean": 7.300000190734863, "rewards/reward_combined/std": 0.40000009536743164, "step": 8256 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.75, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.75, "completions/mean_terminated_length": 6.75, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 152.90740740740742, "frac_reward_zero_std": 1.0, "grad_norm": 0.020949097350239754, "kl": 0.001057433255482465, "learning_rate": 9.68888888888889e-07, "loss": 0.0001, "num_tokens": 2463255.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 8257 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 152.92592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.007345511112362146, "kl": 0.23878645151853561, "learning_rate": 9.683333333333333e-07, "loss": 0.0119, "num_tokens": 2463555.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8258 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 152.94444444444446, "frac_reward_zero_std": 0.0, "grad_norm": 7.842862606048584, "kl": 0.051941774785518646, "learning_rate": 9.677777777777779e-07, "loss": 0.0391, "num_tokens": 2463847.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 8259 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 152.96296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.02282148040831089, "kl": 0.004666342167183757, "learning_rate": 9.672222222222222e-07, "loss": 0.0002, "num_tokens": 2464131.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8260 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0016666667070239782, "clip_ratio/low_min": 0.0016666667070239782, "clip_ratio/region_mean": 0.0016666667070239782, "completion_length": 95.0, "completions/clipped_ratio": 0.25, "completions/max_length": 256.0, "completions/max_terminated_length": 45.0, "completions/mean_length": 95.0, "completions/mean_terminated_length": 41.333335876464844, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 152.9814814814815, "frac_reward_zero_std": 0.0, "grad_norm": 1.6182349920272827, "kl": 0.10108280926942825, "learning_rate": 9.666666666666668e-07, "loss": 0.0377, "num_tokens": 2464763.0, "reward": 4.300000190734863, "reward_std": 4.284857273101807, "rewards/reward_combined/mean": 4.300000190734863, "rewards/reward_combined/std": 4.284857273101807, "step": 8261 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.25, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 22.25, "completions/mean_terminated_length": 22.25, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 153.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.12186551094055176, "kl": 0.02647504024207592, "learning_rate": 9.661111111111112e-07, "loss": 0.0013, "num_tokens": 2465096.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8262 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 34.5, "completions/clipped_ratio": 0.0, "completions/max_length": 38.0, "completions/max_terminated_length": 38.0, "completions/mean_length": 34.5, "completions/mean_terminated_length": 34.5, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 153.0185185185185, "frac_reward_zero_std": 0.0, "grad_norm": 2.0036673545837402, "kl": 0.1380043774843216, "learning_rate": 9.655555555555557e-07, "loss": -0.0284, "num_tokens": 2465458.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 8263 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 153.03703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.027236195281147957, "kl": 0.012697031255811453, "learning_rate": 9.65e-07, "loss": 0.0006, "num_tokens": 2465718.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8264 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 153.05555555555554, "frac_reward_zero_std": 1.0, "grad_norm": 0.057199254631996155, "kl": 0.17586375027894974, "learning_rate": 9.644444444444444e-07, "loss": 0.0088, "num_tokens": 2466002.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8265 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 153.07407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.01636054739356041, "kl": 0.0036018043756484985, "learning_rate": 9.63888888888889e-07, "loss": 0.0002, "num_tokens": 2466262.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 8266 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 5.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 5.0, "completions/mean_terminated_length": 5.0, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 153.09259259259258, "frac_reward_zero_std": 1.0, "grad_norm": 0.003658340312540531, "kl": 0.00043678880319930613, "learning_rate": 9.633333333333336e-07, "loss": 0.0, "num_tokens": 2466482.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 8267 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 36.75, "completions/clipped_ratio": 0.0, "completions/max_length": 45.0, "completions/max_terminated_length": 45.0, "completions/mean_length": 36.75, "completions/mean_terminated_length": 36.75, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 153.11111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.04478210583329201, "kl": 0.09021195024251938, "learning_rate": 9.62777777777778e-07, "loss": 0.0042, "num_tokens": 2466897.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8268 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 153.12962962962962, "frac_reward_zero_std": 1.0, "grad_norm": 0.009182375855743885, "kl": 0.0038805862423032522, "learning_rate": 9.622222222222223e-07, "loss": 0.0002, "num_tokens": 2467161.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8269 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 153.14814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.033710237592458725, "kl": 0.004330638563260436, "learning_rate": 9.616666666666666e-07, "loss": 0.0002, "num_tokens": 2467461.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8270 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 35.25, "completions/clipped_ratio": 0.0, "completions/max_length": 37.0, "completions/max_terminated_length": 37.0, "completions/mean_length": 35.25, "completions/mean_terminated_length": 35.25, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 153.16666666666666, "frac_reward_zero_std": 0.0, "grad_norm": 1.856102705001831, "kl": 0.13928833976387978, "learning_rate": 9.611111111111112e-07, "loss": 0.0005, "num_tokens": 2467830.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 8271 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 153.1851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.0024516680277884007, "kl": 0.28225037455558777, "learning_rate": 9.605555555555558e-07, "loss": 0.0141, "num_tokens": 2468118.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8272 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 153.2037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.04780052974820137, "kl": 0.002663895531441085, "learning_rate": 9.600000000000001e-07, "loss": 0.0001, "num_tokens": 2468374.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 8273 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 153.22222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.037041909992694855, "kl": 0.004753183806315064, "learning_rate": 9.594444444444445e-07, "loss": 0.0002, "num_tokens": 2468658.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8274 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 153.24074074074073, "frac_reward_zero_std": 1.0, "grad_norm": 0.038940176367759705, "kl": 0.011779502965509892, "learning_rate": 9.588888888888888e-07, "loss": 0.0006, "num_tokens": 2468934.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8275 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 7.75, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 7.75, "completions/mean_terminated_length": 7.75, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 153.25925925925927, "frac_reward_zero_std": 1.0, "grad_norm": 0.03011169843375683, "kl": 0.0015916085103526711, "learning_rate": 9.583333333333334e-07, "loss": 0.0001, "num_tokens": 2469177.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 8276 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.25, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.25, "completions/mean_terminated_length": 11.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 153.27777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.02563072182238102, "kl": 0.007875108160078526, "learning_rate": 9.57777777777778e-07, "loss": 0.0004, "num_tokens": 2469450.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8277 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.75, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.75, "completions/mean_terminated_length": 14.75, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 153.2962962962963, "frac_reward_zero_std": 0.0, "grad_norm": 5.937426567077637, "kl": 0.8296699374914169, "learning_rate": 9.572222222222223e-07, "loss": 0.0259, "num_tokens": 2469733.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 8278 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.25, "completions/clipped_ratio": 0.0, "completions/max_length": 30.0, "completions/max_terminated_length": 30.0, "completions/mean_length": 23.25, "completions/mean_terminated_length": 23.25, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 153.3148148148148, "frac_reward_zero_std": 0.0, "grad_norm": 3.7504336833953857, "kl": 0.04381680674850941, "learning_rate": 9.566666666666667e-07, "loss": -0.0176, "num_tokens": 2470046.0, "reward": 5.25, "reward_std": 3.3040380477905273, "rewards/reward_combined/mean": 5.25, "rewards/reward_combined/std": 3.3040380477905273, "step": 8279 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 41.5, "completions/clipped_ratio": 0.0, "completions/max_length": 51.0, "completions/max_terminated_length": 51.0, "completions/mean_length": 41.5, "completions/mean_terminated_length": 41.5, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 153.33333333333334, "frac_reward_zero_std": 0.0, "grad_norm": 0.8881515860557556, "kl": 0.1320575401186943, "learning_rate": 9.56111111111111e-07, "loss": -0.1023, "num_tokens": 2470492.0, "reward": 2.375, "reward_std": 1.25, "rewards/reward_combined/mean": 2.375, "rewards/reward_combined/std": 1.25, "step": 8280 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 153.35185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.012453915551304817, "kl": 0.008050315082073212, "learning_rate": 9.555555555555556e-07, "loss": 0.0004, "num_tokens": 2470728.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 8281 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 153.37037037037038, "frac_reward_zero_std": 1.0, "grad_norm": 0.003128413110971451, "kl": 0.00562521256506443, "learning_rate": 9.550000000000002e-07, "loss": 0.0003, "num_tokens": 2471040.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 8282 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 28.25, "completions/clipped_ratio": 0.0, "completions/max_length": 29.0, "completions/max_terminated_length": 29.0, "completions/mean_length": 28.25, "completions/mean_terminated_length": 28.25, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 153.38888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.05545457452535629, "kl": 0.13920355588197708, "learning_rate": 9.544444444444445e-07, "loss": 0.007, "num_tokens": 2471369.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8283 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 18.0, "completions/mean_terminated_length": 18.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 153.40740740740742, "frac_reward_zero_std": 1.0, "grad_norm": 0.1999225914478302, "kl": 0.19883739948272705, "learning_rate": 9.538888888888889e-07, "loss": 0.01, "num_tokens": 2471681.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8284 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.5, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 20.5, "completions/mean_terminated_length": 20.5, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 153.42592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.08551445603370667, "kl": 0.015486528165638447, "learning_rate": 9.533333333333335e-07, "loss": 0.0008, "num_tokens": 2472015.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8285 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 43.75, "completions/clipped_ratio": 0.0, "completions/max_length": 70.0, "completions/max_terminated_length": 70.0, "completions/mean_length": 43.75, "completions/mean_terminated_length": 43.75, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 153.44444444444446, "frac_reward_zero_std": 0.0, "grad_norm": 2.743464231491089, "kl": 0.04661625996232033, "learning_rate": 9.527777777777779e-07, "loss": 0.1929, "num_tokens": 2472414.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 8286 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.5, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 20.5, "completions/mean_terminated_length": 20.5, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 153.46296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.018636181950569153, "kl": 0.005179662257432938, "learning_rate": 9.522222222222223e-07, "loss": 0.0003, "num_tokens": 2472740.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8287 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 153.4814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 3.9281094359466806e-05, "kl": 4.9173831939697266e-06, "learning_rate": 9.516666666666667e-07, "loss": 0.0, "num_tokens": 2472960.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 8288 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.75, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 12.75, "completions/mean_terminated_length": 12.75, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 153.5, "frac_reward_zero_std": 1.0, "grad_norm": 0.023418180644512177, "kl": 0.0016410013195127249, "learning_rate": 9.511111111111111e-07, "loss": 0.0001, "num_tokens": 2473223.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8289 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.25, "completions/clipped_ratio": 0.0, "completions/max_length": 35.0, "completions/max_terminated_length": 35.0, "completions/mean_length": 25.25, "completions/mean_terminated_length": 25.25, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 153.5185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.6565191745758057, "kl": 0.10021274443715811, "learning_rate": 9.505555555555557e-07, "loss": 0.006, "num_tokens": 2473552.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8290 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 153.53703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.16534766554832458, "kl": 0.07653161510825157, "learning_rate": 9.500000000000001e-07, "loss": 0.0038, "num_tokens": 2473840.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8291 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.25, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 19.25, "completions/mean_terminated_length": 19.25, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 153.55555555555554, "frac_reward_zero_std": 1.0, "grad_norm": 0.020233100280165672, "kl": 0.0072430099826306105, "learning_rate": 9.494444444444445e-07, "loss": 0.0004, "num_tokens": 2474145.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8292 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.25, "completions/clipped_ratio": 0.0, "completions/max_length": 27.0, "completions/max_terminated_length": 27.0, "completions/mean_length": 22.25, "completions/mean_terminated_length": 22.25, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 153.57407407407408, "frac_reward_zero_std": 0.0, "grad_norm": 3.951833963394165, "kl": 0.08675306662917137, "learning_rate": 9.488888888888889e-07, "loss": 0.0783, "num_tokens": 2474454.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 8293 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.25, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.25, "completions/mean_terminated_length": 14.25, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 153.59259259259258, "frac_reward_zero_std": 0.0, "grad_norm": 7.298069000244141, "kl": 0.04556289501488209, "learning_rate": 9.483333333333335e-07, "loss": 0.0244, "num_tokens": 2474727.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 8294 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.5, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.5, "completions/mean_terminated_length": 6.5, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 153.61111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.006797728128731251, "kl": 0.0007704070303589106, "learning_rate": 9.477777777777779e-07, "loss": 0.0, "num_tokens": 2474961.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 8295 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 153.62962962962962, "frac_reward_zero_std": 1.0, "grad_norm": 0.006605051923543215, "kl": 0.22628721594810486, "learning_rate": 9.472222222222223e-07, "loss": 0.0113, "num_tokens": 2475263.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8296 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 153.64814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.04089266061782837, "kl": 0.0032128699822351336, "learning_rate": 9.466666666666667e-07, "loss": 0.0002, "num_tokens": 2475519.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8297 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.75, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 6.75, "completions/mean_terminated_length": 6.75, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 153.66666666666666, "frac_reward_zero_std": 1.0, "grad_norm": 0.061719030141830444, "kl": 0.026013270020484924, "learning_rate": 9.461111111111112e-07, "loss": 0.0013, "num_tokens": 2475746.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 8298 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 153.6851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.06792201101779938, "kl": 0.0039111756486818194, "learning_rate": 9.455555555555557e-07, "loss": 0.0002, "num_tokens": 2476042.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8299 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 153.7037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.021750284358859062, "kl": 0.0034491121186874807, "learning_rate": 9.450000000000001e-07, "loss": 0.0002, "num_tokens": 2476324.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8300 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.5, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 8.5, "completions/mean_terminated_length": 8.5, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 153.72222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.013388005085289478, "kl": 0.004657013894757256, "learning_rate": 9.444444444444445e-07, "loss": 0.0003, "num_tokens": 2476578.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 8301 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 26.5, "completions/clipped_ratio": 0.0, "completions/max_length": 27.0, "completions/max_terminated_length": 27.0, "completions/mean_length": 26.5, "completions/mean_terminated_length": 26.5, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 153.74074074074073, "frac_reward_zero_std": 0.0, "grad_norm": 4.9824066162109375, "kl": 0.10626695305109024, "learning_rate": 9.438888888888889e-07, "loss": 0.0292, "num_tokens": 2476932.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 8302 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 153.75925925925927, "frac_reward_zero_std": 1.0, "grad_norm": 0.020679347217082977, "kl": 0.009751351783052087, "learning_rate": 9.433333333333334e-07, "loss": 0.0005, "num_tokens": 2477218.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8303 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 153.77777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.03357289358973503, "kl": 0.0031597688794136047, "learning_rate": 9.427777777777779e-07, "loss": 0.0002, "num_tokens": 2477426.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 8304 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 153.7962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.03579653054475784, "kl": 0.0020621120929718018, "learning_rate": 9.422222222222223e-07, "loss": 0.0001, "num_tokens": 2477638.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 8305 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.75, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 12.75, "completions/mean_terminated_length": 12.75, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 153.8148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.03676250949501991, "kl": 0.043708053417503834, "learning_rate": 9.416666666666667e-07, "loss": 0.0022, "num_tokens": 2477913.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8306 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 34.25, "completions/clipped_ratio": 0.0, "completions/max_length": 49.0, "completions/max_terminated_length": 49.0, "completions/mean_length": 34.25, "completions/mean_terminated_length": 34.25, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 153.83333333333334, "frac_reward_zero_std": 1.0, "grad_norm": 0.04659417271614075, "kl": 0.018662646878510714, "learning_rate": 9.411111111111113e-07, "loss": 0.0009, "num_tokens": 2478270.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8307 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 153.85185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.01201381254941225, "kl": 0.0008304885705001652, "learning_rate": 9.405555555555556e-07, "loss": 0.0, "num_tokens": 2478558.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8308 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 153.87037037037038, "frac_reward_zero_std": 1.0, "grad_norm": 0.019333740696310997, "kl": 0.0004040449857711792, "learning_rate": 9.400000000000001e-07, "loss": 0.0, "num_tokens": 2478770.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 8309 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.75, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.75, "completions/mean_terminated_length": 14.75, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 153.88888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.0034411591477692127, "kl": 0.0013380356249399483, "learning_rate": 9.394444444444445e-07, "loss": 0.0001, "num_tokens": 2479089.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8310 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 38.75, "completions/clipped_ratio": 0.0, "completions/max_length": 44.0, "completions/max_terminated_length": 44.0, "completions/mean_length": 38.75, "completions/mean_terminated_length": 38.75, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 153.90740740740742, "frac_reward_zero_std": 1.0, "grad_norm": 0.1703898310661316, "kl": 0.11844128370285034, "learning_rate": 9.388888888888889e-07, "loss": 0.0059, "num_tokens": 2479496.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8311 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.25, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 18.25, "completions/mean_terminated_length": 18.25, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 153.92592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.01984596811234951, "kl": 0.0024371393665205687, "learning_rate": 9.383333333333335e-07, "loss": 0.0001, "num_tokens": 2479809.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8312 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.0, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 20.0, "completions/mean_terminated_length": 20.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 153.94444444444446, "frac_reward_zero_std": 0.0, "grad_norm": 5.908341407775879, "kl": 0.31412844359874725, "learning_rate": 9.377777777777778e-07, "loss": -0.2458, "num_tokens": 2480125.0, "reward": 5.875, "reward_std": 3.25, "rewards/reward_combined/mean": 5.875, "rewards/reward_combined/std": 3.25, "step": 8313 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.75, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 18.75, "completions/mean_terminated_length": 18.75, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 153.96296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.040607988834381104, "kl": 0.12789736920967698, "learning_rate": 9.372222222222223e-07, "loss": 0.0075, "num_tokens": 2480448.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8314 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.5, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 19.5, "completions/mean_terminated_length": 19.5, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 153.9814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.11634599417448044, "kl": 0.11504924669861794, "learning_rate": 9.366666666666667e-07, "loss": 0.0057, "num_tokens": 2480754.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8315 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.25, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 19.25, "completions/mean_terminated_length": 19.25, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 154.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.08588802814483643, "kl": 0.031782953068614006, "learning_rate": 9.361111111111112e-07, "loss": 0.0016, "num_tokens": 2481051.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8316 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 154.0185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.010207870043814182, "kl": 0.0015519476728513837, "learning_rate": 9.355555555555557e-07, "loss": 0.0001, "num_tokens": 2481369.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8317 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 154.03703703703704, "frac_reward_zero_std": 0.0, "grad_norm": 5.137205123901367, "kl": 0.016152539290487766, "learning_rate": 9.35e-07, "loss": -0.018, "num_tokens": 2481643.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 8318 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 154.05555555555554, "frac_reward_zero_std": 1.0, "grad_norm": 0.06385917961597443, "kl": 0.28538307547569275, "learning_rate": 9.344444444444445e-07, "loss": 0.0143, "num_tokens": 2481931.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8319 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.5, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 18.5, "completions/mean_terminated_length": 18.5, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 154.07407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.003954765852540731, "kl": 0.0005652245599776506, "learning_rate": 9.338888888888889e-07, "loss": 0.0, "num_tokens": 2482245.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8320 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.25, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 4.25, "completions/mean_terminated_length": 4.25, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 154.09259259259258, "frac_reward_zero_std": 1.0, "grad_norm": 0.02163071744143963, "kl": 0.0004205256700515747, "learning_rate": 9.333333333333334e-07, "loss": 0.0, "num_tokens": 2482458.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 8321 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.75, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 4.75, "completions/mean_terminated_length": 4.75, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 154.11111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.01696672849357128, "kl": 0.001633541309274733, "learning_rate": 9.327777777777779e-07, "loss": 0.0001, "num_tokens": 2482677.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 8322 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 154.12962962962962, "frac_reward_zero_std": 1.0, "grad_norm": 0.09376358985900879, "kl": 0.029560466995462775, "learning_rate": 9.322222222222222e-07, "loss": 0.0016, "num_tokens": 2482975.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8323 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.75, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 19.75, "completions/mean_terminated_length": 19.75, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 154.14814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.11275040358304977, "kl": 0.016000008676201105, "learning_rate": 9.316666666666667e-07, "loss": 0.0008, "num_tokens": 2483306.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8324 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 154.16666666666666, "frac_reward_zero_std": 1.0, "grad_norm": 0.007233601529151201, "kl": 0.0008807480335235596, "learning_rate": 9.311111111111113e-07, "loss": 0.0, "num_tokens": 2483518.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 8325 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.25, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.25, "completions/mean_terminated_length": 13.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 154.1851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.002649082103744149, "kl": 0.0019323137239553034, "learning_rate": 9.305555555555556e-07, "loss": 0.0001, "num_tokens": 2483795.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8326 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 154.2037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.001187926041893661, "kl": 0.00011153519153594971, "learning_rate": 9.300000000000001e-07, "loss": 0.0, "num_tokens": 2484051.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 8327 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 75.0, "completions/clipped_ratio": 0.25, "completions/max_length": 256.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 75.0, "completions/mean_terminated_length": 14.666666984558105, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 154.22222222222223, "frac_reward_zero_std": 0.0, "grad_norm": 1.252397060394287, "kl": 0.01030015666037798, "learning_rate": 9.294444444444445e-07, "loss": 0.4419, "num_tokens": 2484583.0, "reward": 5.300000190734863, "reward_std": 4.399999618530273, "rewards/reward_combined/mean": 5.300000190734863, "rewards/reward_combined/std": 4.400000095367432, "step": 8328 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 38.25, "completions/clipped_ratio": 0.0, "completions/max_length": 45.0, "completions/max_terminated_length": 45.0, "completions/mean_length": 38.25, "completions/mean_terminated_length": 38.25, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 154.24074074074073, "frac_reward_zero_std": 1.0, "grad_norm": 0.04725639894604683, "kl": 0.08859854936599731, "learning_rate": 9.288888888888889e-07, "loss": 0.0042, "num_tokens": 2485004.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8329 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.5, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 8.5, "completions/mean_terminated_length": 8.5, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 154.25925925925927, "frac_reward_zero_std": 1.0, "grad_norm": 0.09279268234968185, "kl": 0.037766383960843086, "learning_rate": 9.283333333333335e-07, "loss": 0.002, "num_tokens": 2485250.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 8330 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.25, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 20.25, "completions/mean_terminated_length": 20.25, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 154.27777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.027816716581583023, "kl": 0.006652844720520079, "learning_rate": 9.277777777777778e-07, "loss": 0.0003, "num_tokens": 2485575.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8331 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 34.5, "completions/clipped_ratio": 0.0, "completions/max_length": 47.0, "completions/max_terminated_length": 47.0, "completions/mean_length": 34.5, "completions/mean_terminated_length": 34.5, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 154.2962962962963, "frac_reward_zero_std": 0.0, "grad_norm": 1.818727731704712, "kl": 0.04398629255592823, "learning_rate": 9.272222222222223e-07, "loss": 0.0065, "num_tokens": 2485937.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 8332 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 154.3148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.009965533390641212, "kl": 0.25134654343128204, "learning_rate": 9.266666666666667e-07, "loss": 0.0125, "num_tokens": 2486235.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8333 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 154.33333333333334, "frac_reward_zero_std": 0.0, "grad_norm": 3.8526859283447266, "kl": 0.1846371239516884, "learning_rate": 9.261111111111112e-07, "loss": 0.007, "num_tokens": 2486511.0, "reward": 4.5, "reward_std": 4.041451930999756, "rewards/reward_combined/mean": 4.5, "rewards/reward_combined/std": 4.041451930999756, "step": 8334 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 30.25, "completions/clipped_ratio": 0.0, "completions/max_length": 35.0, "completions/max_terminated_length": 35.0, "completions/mean_length": 30.25, "completions/mean_terminated_length": 30.25, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 154.35185185185185, "frac_reward_zero_std": 0.0, "grad_norm": 3.3879215717315674, "kl": 0.18181029707193375, "learning_rate": 9.255555555555557e-07, "loss": 0.0455, "num_tokens": 2486856.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 8335 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 33.0, "completions/clipped_ratio": 0.0, "completions/max_length": 40.0, "completions/max_terminated_length": 40.0, "completions/mean_length": 33.0, "completions/mean_terminated_length": 33.0, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 154.37037037037038, "frac_reward_zero_std": 0.0, "grad_norm": 2.0777792930603027, "kl": 0.2687152922153473, "learning_rate": 9.25e-07, "loss": 0.0811, "num_tokens": 2487224.0, "reward": 6.625, "reward_std": 2.0966243743896484, "rewards/reward_combined/mean": 6.625, "rewards/reward_combined/std": 2.0966243743896484, "step": 8336 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 154.38888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.0501963272690773, "kl": 0.006814445718191564, "learning_rate": 9.244444444444445e-07, "loss": 0.0004, "num_tokens": 2487484.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8337 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.0, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 17.0, "completions/mean_terminated_length": 17.0, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 154.40740740740742, "frac_reward_zero_std": 1.0, "grad_norm": 0.023462044075131416, "kl": 0.15024803578853607, "learning_rate": 9.238888888888891e-07, "loss": 0.0076, "num_tokens": 2487792.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8338 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 29.25, "completions/clipped_ratio": 0.0, "completions/max_length": 39.0, "completions/max_terminated_length": 39.0, "completions/mean_length": 29.25, "completions/mean_terminated_length": 29.25, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 154.42592592592592, "frac_reward_zero_std": 0.0, "grad_norm": 2.578113555908203, "kl": 0.06913801282644272, "learning_rate": 9.233333333333334e-07, "loss": -0.11, "num_tokens": 2488161.0, "reward": 6.625, "reward_std": 2.428133726119995, "rewards/reward_combined/mean": 6.625, "rewards/reward_combined/std": 2.428133726119995, "step": 8339 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.5, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 22.5, "completions/mean_terminated_length": 22.5, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 154.44444444444446, "frac_reward_zero_std": 0.0, "grad_norm": 2.3085546493530273, "kl": 0.01586135569959879, "learning_rate": 9.227777777777779e-07, "loss": -0.0177, "num_tokens": 2488479.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 8340 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.75, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.75, "completions/mean_terminated_length": 6.75, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 154.46296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.021798500791192055, "kl": 0.0011989488266408443, "learning_rate": 9.222222222222222e-07, "loss": 0.0001, "num_tokens": 2488714.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 8341 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 12.5, "completions/mean_terminated_length": 12.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 154.4814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.03184311464428902, "kl": 0.004544412717223167, "learning_rate": 9.216666666666667e-07, "loss": 0.0002, "num_tokens": 2488996.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8342 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 154.5, "frac_reward_zero_std": 1.0, "grad_norm": 0.02491951733827591, "kl": 0.013506738752766978, "learning_rate": 9.211111111111113e-07, "loss": 0.0007, "num_tokens": 2489284.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8343 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.25, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 22.25, "completions/mean_terminated_length": 22.25, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 154.5185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.3063126802444458, "kl": 0.03375224396586418, "learning_rate": 9.205555555555556e-07, "loss": 0.0018, "num_tokens": 2489621.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 8344 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 33.75, "completions/clipped_ratio": 0.0, "completions/max_length": 51.0, "completions/max_terminated_length": 51.0, "completions/mean_length": 33.75, "completions/mean_terminated_length": 33.75, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 154.53703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.05121413245797157, "kl": 0.033615912310779095, "learning_rate": 9.200000000000001e-07, "loss": 0.0018, "num_tokens": 2489976.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8345 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 154.55555555555554, "frac_reward_zero_std": 1.0, "grad_norm": 3.615007153712213e-05, "kl": 4.447996616363525e-06, "learning_rate": 9.194444444444444e-07, "loss": 0.0, "num_tokens": 2490196.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 8346 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 154.57407407407408, "frac_reward_zero_std": 0.0, "grad_norm": 2.6390206813812256, "kl": 0.1956336349248886, "learning_rate": 9.18888888888889e-07, "loss": 0.0041, "num_tokens": 2490456.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 8347 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.75, "completions/clipped_ratio": 0.0, "completions/max_length": 27.0, "completions/max_terminated_length": 27.0, "completions/mean_length": 21.75, "completions/mean_terminated_length": 21.75, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 154.59259259259258, "frac_reward_zero_std": 0.0, "grad_norm": 3.4119365215301514, "kl": 0.07747355103492737, "learning_rate": 9.183333333333335e-07, "loss": -0.0797, "num_tokens": 2490771.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 8348 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.25, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 16.25, "completions/mean_terminated_length": 16.25, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 154.61111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.04108252376317978, "kl": 0.00769917294383049, "learning_rate": 9.177777777777778e-07, "loss": 0.0004, "num_tokens": 2491064.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8349 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.75, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 11.75, "completions/mean_terminated_length": 11.75, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 154.62962962962962, "frac_reward_zero_std": 1.0, "grad_norm": 0.14465679228305817, "kl": 0.026771453209221363, "learning_rate": 9.172222222222223e-07, "loss": 0.0013, "num_tokens": 2491311.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 8350 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.25, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 9.25, "completions/mean_terminated_length": 9.25, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 154.64814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.013833453878760338, "kl": 0.01050372701138258, "learning_rate": 9.166666666666666e-07, "loss": 0.0005, "num_tokens": 2491572.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8351 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 154.66666666666666, "frac_reward_zero_std": 1.0, "grad_norm": 0.04062715917825699, "kl": 0.0036940262652933598, "learning_rate": 9.161111111111112e-07, "loss": 0.0002, "num_tokens": 2491872.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8352 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 154.6851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.031730737537145615, "kl": 0.009587745182216167, "learning_rate": 9.155555555555557e-07, "loss": 0.0005, "num_tokens": 2492152.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8353 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 154.7037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.03707212209701538, "kl": 0.06216977909207344, "learning_rate": 9.15e-07, "loss": 0.0033, "num_tokens": 2492432.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8354 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 12.5, "completions/mean_terminated_length": 12.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 154.72222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.2909958064556122, "kl": 0.05664581432938576, "learning_rate": 9.144444444444445e-07, "loss": 0.0033, "num_tokens": 2492702.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8355 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.25, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 12.25, "completions/mean_terminated_length": 12.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 154.74074074074073, "frac_reward_zero_std": 1.0, "grad_norm": 0.01114567182958126, "kl": 0.0006502115866169333, "learning_rate": 9.138888888888891e-07, "loss": 0.0, "num_tokens": 2492963.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8356 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 18.0, "completions/mean_terminated_length": 18.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 154.75925925925927, "frac_reward_zero_std": 1.0, "grad_norm": 0.09315097332000732, "kl": 0.17853690683841705, "learning_rate": 9.133333333333334e-07, "loss": 0.009, "num_tokens": 2493255.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8357 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 154.77777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.01084123831242323, "kl": 0.008240901865065098, "learning_rate": 9.127777777777779e-07, "loss": 0.0004, "num_tokens": 2493567.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 8358 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 30.25, "completions/clipped_ratio": 0.0, "completions/max_length": 33.0, "completions/max_terminated_length": 33.0, "completions/mean_length": 30.25, "completions/mean_terminated_length": 30.25, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 154.7962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.13067644834518433, "kl": 0.12974867224693298, "learning_rate": 9.122222222222222e-07, "loss": 0.0065, "num_tokens": 2493904.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8359 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.5, "completions/clipped_ratio": 0.0, "completions/max_length": 29.0, "completions/max_terminated_length": 29.0, "completions/mean_length": 18.5, "completions/mean_terminated_length": 18.5, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 154.8148148148148, "frac_reward_zero_std": 0.0, "grad_norm": 4.671277046203613, "kl": 0.8786428272724152, "learning_rate": 9.116666666666667e-07, "loss": 0.2152, "num_tokens": 2494202.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 8360 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 154.83333333333334, "frac_reward_zero_std": 1.0, "grad_norm": 0.04173637926578522, "kl": 0.0049191436264663935, "learning_rate": 9.111111111111113e-07, "loss": 0.0002, "num_tokens": 2494474.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8361 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 154.85185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.0031350692734122276, "kl": 0.00023854971368564293, "learning_rate": 9.105555555555556e-07, "loss": 0.0, "num_tokens": 2494734.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 8362 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 154.87037037037038, "frac_reward_zero_std": 1.0, "grad_norm": 0.03897099196910858, "kl": 0.045018908102065325, "learning_rate": 9.100000000000001e-07, "loss": 0.0023, "num_tokens": 2495024.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8363 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.25, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 22.25, "completions/mean_terminated_length": 22.25, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 154.88888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.027980871498584747, "kl": 0.01656325114890933, "learning_rate": 9.094444444444444e-07, "loss": 0.0008, "num_tokens": 2495361.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8364 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 47.0, "completions/clipped_ratio": 0.0, "completions/max_length": 51.0, "completions/max_terminated_length": 51.0, "completions/mean_length": 47.0, "completions/mean_terminated_length": 47.0, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 154.90740740740742, "frac_reward_zero_std": 0.0, "grad_norm": 0.6939200758934021, "kl": 0.21767177432775497, "learning_rate": 9.08888888888889e-07, "loss": -0.0328, "num_tokens": 2495829.0, "reward": 2.799999952316284, "reward_std": 0.4000000059604645, "rewards/reward_combined/mean": 2.799999952316284, "rewards/reward_combined/std": 0.4000000059604645, "step": 8365 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 154.92592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.012232051230967045, "kl": 0.00821755826473236, "learning_rate": 9.083333333333335e-07, "loss": 0.0004, "num_tokens": 2496065.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 8366 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 33.5, "completions/clipped_ratio": 0.0, "completions/max_length": 34.0, "completions/max_terminated_length": 34.0, "completions/mean_length": 33.5, "completions/mean_terminated_length": 33.5, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 154.94444444444446, "frac_reward_zero_std": 1.0, "grad_norm": 0.0238717682659626, "kl": 0.0713818408548832, "learning_rate": 9.077777777777778e-07, "loss": 0.0036, "num_tokens": 2496427.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8367 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.009433962404727936, "clip_ratio/low_min": 0.009433962404727936, "clip_ratio/region_mean": 0.009433962404727936, "completion_length": 23.5, "completions/clipped_ratio": 0.0, "completions/max_length": 28.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 23.5, "completions/mean_terminated_length": 23.5, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 154.96296296296296, "frac_reward_zero_std": 0.0, "grad_norm": 3.8453149795532227, "kl": 0.1385357677936554, "learning_rate": 9.072222222222223e-07, "loss": -0.0248, "num_tokens": 2496741.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 8368 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 154.9814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.040253326296806335, "kl": 0.0035328641533851624, "learning_rate": 9.066666666666668e-07, "loss": 0.0002, "num_tokens": 2496949.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 8369 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.75, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 19.75, "completions/mean_terminated_length": 19.75, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 155.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.372709184885025, "kl": 0.08630516193807125, "learning_rate": 9.061111111111112e-07, "loss": 0.0043, "num_tokens": 2497248.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8370 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 155.0185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.0325293205678463, "kl": 0.0024698544293642044, "learning_rate": 9.055555555555557e-07, "loss": 0.0001, "num_tokens": 2497508.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8371 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 155.03703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.017273100093007088, "kl": 0.022936068009585142, "learning_rate": 9.05e-07, "loss": 0.0012, "num_tokens": 2497798.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8372 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.75, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 19.75, "completions/mean_terminated_length": 19.75, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 155.05555555555554, "frac_reward_zero_std": 1.0, "grad_norm": 0.0708007887005806, "kl": 0.02536606602370739, "learning_rate": 9.044444444444445e-07, "loss": 0.0012, "num_tokens": 2498097.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8373 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 43.5, "completions/clipped_ratio": 0.0, "completions/max_length": 57.0, "completions/max_terminated_length": 57.0, "completions/mean_length": 43.5, "completions/mean_terminated_length": 43.5, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 155.07407407407408, "frac_reward_zero_std": 0.0, "grad_norm": 2.3671231269836426, "kl": 0.0691005028784275, "learning_rate": 9.03888888888889e-07, "loss": 0.0172, "num_tokens": 2498495.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 8374 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 155.09259259259258, "frac_reward_zero_std": 1.0, "grad_norm": 0.06338417530059814, "kl": 0.0074039127212017775, "learning_rate": 9.033333333333334e-07, "loss": 0.0004, "num_tokens": 2498781.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8375 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.25, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 13.25, "completions/mean_terminated_length": 13.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 155.11111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.010506738908588886, "kl": 0.0010606225114315748, "learning_rate": 9.027777777777779e-07, "loss": 0.0001, "num_tokens": 2499046.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8376 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.25, "completions/clipped_ratio": 0.0, "completions/max_length": 29.0, "completions/max_terminated_length": 29.0, "completions/mean_length": 24.25, "completions/mean_terminated_length": 24.25, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 155.12962962962962, "frac_reward_zero_std": 0.0, "grad_norm": 2.481752872467041, "kl": 0.10610418021678925, "learning_rate": 9.022222222222222e-07, "loss": 0.0192, "num_tokens": 2499359.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 8377 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 155.14814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.004853736609220505, "kl": 0.0004749968647956848, "learning_rate": 9.016666666666668e-07, "loss": 0.0, "num_tokens": 2499603.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 8378 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 155.16666666666666, "frac_reward_zero_std": 1.0, "grad_norm": 0.03133650869131088, "kl": 0.008955324534326792, "learning_rate": 9.011111111111112e-07, "loss": 0.0004, "num_tokens": 2499883.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8379 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.5, "completions/clipped_ratio": 0.0, "completions/max_length": 25.0, "completions/max_terminated_length": 25.0, "completions/mean_length": 20.5, "completions/mean_terminated_length": 20.5, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 155.1851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.060444410890340805, "kl": 0.02446411456912756, "learning_rate": 9.005555555555556e-07, "loss": 0.0012, "num_tokens": 2500213.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8380 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 155.2037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.012068552896380424, "kl": 0.02737893909215927, "learning_rate": 9.000000000000001e-07, "loss": 0.0014, "num_tokens": 2500429.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 8381 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.0, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 20.0, "completions/mean_terminated_length": 20.0, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 155.22222222222223, "frac_reward_zero_std": 0.0, "grad_norm": 3.497473955154419, "kl": 0.09171618521213531, "learning_rate": 8.994444444444444e-07, "loss": 0.0545, "num_tokens": 2500737.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 8382 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.75, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.75, "completions/mean_terminated_length": 6.75, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 155.24074074074073, "frac_reward_zero_std": 1.0, "grad_norm": 0.005428800359368324, "kl": 0.000617049285210669, "learning_rate": 8.98888888888889e-07, "loss": 0.0, "num_tokens": 2500972.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 8383 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 155.25925925925927, "frac_reward_zero_std": 1.0, "grad_norm": 0.05336778983473778, "kl": 0.014540864620357752, "learning_rate": 8.983333333333334e-07, "loss": 0.0007, "num_tokens": 2501242.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8384 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.0, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 19.0, "completions/mean_terminated_length": 19.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 155.27777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.00963960774242878, "kl": 0.0007224874279927462, "learning_rate": 8.977777777777778e-07, "loss": 0.0, "num_tokens": 2501558.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8385 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 155.2962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.009757086634635925, "kl": 0.004333636781666428, "learning_rate": 8.972222222222223e-07, "loss": 0.0002, "num_tokens": 2501846.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8386 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 155.3148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.03624284267425537, "kl": 0.0008025467395782471, "learning_rate": 8.966666666666668e-07, "loss": 0.0, "num_tokens": 2502050.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 8387 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 5.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 5.0, "completions/mean_terminated_length": 5.0, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 155.33333333333334, "frac_reward_zero_std": 1.0, "grad_norm": 0.003985479474067688, "kl": 0.0004716753901448101, "learning_rate": 8.961111111111112e-07, "loss": 0.0, "num_tokens": 2502270.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 8388 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 155.35185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.07243915647268295, "kl": 0.1721273809671402, "learning_rate": 8.955555555555557e-07, "loss": 0.0086, "num_tokens": 2502554.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8389 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.75, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 11.75, "completions/mean_terminated_length": 11.75, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 155.37037037037038, "frac_reward_zero_std": 1.0, "grad_norm": 0.032524388283491135, "kl": 0.024467193987220526, "learning_rate": 8.95e-07, "loss": 0.0013, "num_tokens": 2502825.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8390 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 155.38888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 3.4769782359944656e-05, "kl": 4.112720489501953e-06, "learning_rate": 8.944444444444445e-07, "loss": 0.0, "num_tokens": 2503045.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 8391 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 28.25, "completions/clipped_ratio": 0.0, "completions/max_length": 30.0, "completions/max_terminated_length": 30.0, "completions/mean_length": 28.25, "completions/mean_terminated_length": 28.25, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 155.40740740740742, "frac_reward_zero_std": 0.0, "grad_norm": 3.956838846206665, "kl": 0.08925355412065983, "learning_rate": 8.93888888888889e-07, "loss": 0.0333, "num_tokens": 2503406.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 8392 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 42.0, "completions/clipped_ratio": 0.0, "completions/max_length": 45.0, "completions/max_terminated_length": 45.0, "completions/mean_length": 42.0, "completions/mean_terminated_length": 42.0, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 155.42592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.009756245650351048, "kl": 0.0737454816699028, "learning_rate": 8.933333333333334e-07, "loss": 0.0036, "num_tokens": 2503842.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8393 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.5, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 18.5, "completions/mean_terminated_length": 18.5, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 155.44444444444446, "frac_reward_zero_std": 1.0, "grad_norm": 0.024798234924674034, "kl": 0.009319179924204946, "learning_rate": 8.927777777777779e-07, "loss": 0.0005, "num_tokens": 2504144.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8394 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 33.0, "completions/clipped_ratio": 0.0, "completions/max_length": 35.0, "completions/max_terminated_length": 35.0, "completions/mean_length": 33.0, "completions/mean_terminated_length": 33.0, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 155.46296296296296, "frac_reward_zero_std": 0.0, "grad_norm": 2.108846664428711, "kl": 0.3819397762417793, "learning_rate": 8.922222222222222e-07, "loss": -0.0041, "num_tokens": 2504504.0, "reward": 7.875, "reward_std": 0.25, "rewards/reward_combined/mean": 7.875, "rewards/reward_combined/std": 0.25, "step": 8395 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 31.0, "completions/clipped_ratio": 0.0, "completions/max_length": 40.0, "completions/max_terminated_length": 40.0, "completions/mean_length": 31.0, "completions/mean_terminated_length": 31.0, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 155.4814814814815, "frac_reward_zero_std": 0.0, "grad_norm": 4.6822381019592285, "kl": 0.34627243876457214, "learning_rate": 8.916666666666668e-07, "loss": -0.027, "num_tokens": 2504880.0, "reward": 5.125, "reward_std": 5.421792507171631, "rewards/reward_combined/mean": 5.125, "rewards/reward_combined/std": 5.421792507171631, "step": 8396 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 155.5, "frac_reward_zero_std": 1.0, "grad_norm": 0.03248096629977226, "kl": 0.004179265350103378, "learning_rate": 8.911111111111112e-07, "loss": 0.0002, "num_tokens": 2505140.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 8397 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 12.5, "completions/mean_terminated_length": 12.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 155.5185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.10696563869714737, "kl": 0.052198464050889015, "learning_rate": 8.905555555555556e-07, "loss": 0.0029, "num_tokens": 2505442.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8398 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.75, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 19.75, "completions/mean_terminated_length": 19.75, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 155.53703703703704, "frac_reward_zero_std": 0.0, "grad_norm": 2.7936105728149414, "kl": 0.02725023590028286, "learning_rate": 8.900000000000001e-07, "loss": 0.0016, "num_tokens": 2505765.0, "reward": 5.125, "reward_std": 3.4731109142303467, "rewards/reward_combined/mean": 5.125, "rewards/reward_combined/std": 3.4731109142303467, "step": 8399 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 155.55555555555554, "frac_reward_zero_std": 1.0, "grad_norm": 0.01744229346513748, "kl": 0.23926900327205658, "learning_rate": 8.894444444444446e-07, "loss": 0.0119, "num_tokens": 2506065.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8400 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 44.5, "completions/clipped_ratio": 0.0, "completions/max_length": 51.0, "completions/max_terminated_length": 51.0, "completions/mean_length": 44.5, "completions/mean_terminated_length": 44.5, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "epoch": 155.57407407407408, "frac_reward_zero_std": 0.0, "grad_norm": 0.8259564638137817, "kl": 0.12661323696374893, "learning_rate": 8.88888888888889e-07, "loss": -0.0489, "num_tokens": 2506523.0, "reward": 2.375, "reward_std": 1.25, "rewards/reward_combined/mean": 2.375, "rewards/reward_combined/std": 1.25, "step": 8401 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.5, "completions/clipped_ratio": 0.0, "completions/max_length": 25.0, "completions/max_terminated_length": 25.0, "completions/mean_length": 19.5, "completions/mean_terminated_length": 19.5, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 155.59259259259258, "frac_reward_zero_std": 1.0, "grad_norm": 0.26221567392349243, "kl": 0.0601100604981184, "learning_rate": 8.883333333333334e-07, "loss": 0.0032, "num_tokens": 2506821.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 8402 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 10.5, "completions/mean_terminated_length": 10.5, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 155.61111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.024154087528586388, "kl": 0.0033858882961794734, "learning_rate": 8.877777777777778e-07, "loss": 0.0002, "num_tokens": 2507087.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8403 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 155.62962962962962, "frac_reward_zero_std": 1.0, "grad_norm": 0.030493810772895813, "kl": 0.006721431796904653, "learning_rate": 8.872222222222223e-07, "loss": 0.0004, "num_tokens": 2507411.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8404 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 155.64814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.011983796954154968, "kl": 0.008212104439735413, "learning_rate": 8.866666666666668e-07, "loss": 0.0004, "num_tokens": 2507647.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 8405 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 96.5, "completions/clipped_ratio": 0.25, "completions/max_length": 256.0, "completions/max_terminated_length": 67.0, "completions/mean_length": 96.5, "completions/mean_terminated_length": 43.333335876464844, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 155.66666666666666, "frac_reward_zero_std": 0.0, "grad_norm": 1.4750492572784424, "kl": 0.08594951406121254, "learning_rate": 8.861111111111112e-07, "loss": 0.5169, "num_tokens": 2508257.0, "reward": 3.299999952316284, "reward_std": 5.770037651062012, "rewards/reward_combined/mean": 3.299999952316284, "rewards/reward_combined/std": 5.770037651062012, "step": 8406 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 41.5, "completions/clipped_ratio": 0.0, "completions/max_length": 50.0, "completions/max_terminated_length": 50.0, "completions/mean_length": 41.5, "completions/mean_terminated_length": 41.5, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 155.6851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.03015509620308876, "kl": 0.02869994379580021, "learning_rate": 8.855555555555556e-07, "loss": 0.0014, "num_tokens": 2508643.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8407 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.75, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 13.75, "completions/mean_terminated_length": 13.75, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 155.7037037037037, "frac_reward_zero_std": 0.0, "grad_norm": 3.4553754329681396, "kl": 1.3602250665426254, "learning_rate": 8.85e-07, "loss": -0.0401, "num_tokens": 2508922.0, "reward": 7.875, "reward_std": 0.25, "rewards/reward_combined/mean": 7.875, "rewards/reward_combined/std": 0.25, "step": 8408 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 155.72222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.01458907499909401, "kl": 0.00039880871190689504, "learning_rate": 8.844444444444446e-07, "loss": 0.0, "num_tokens": 2509178.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 8409 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 155.74074074074073, "frac_reward_zero_std": 1.0, "grad_norm": 0.025631200522184372, "kl": 0.012939356733113527, "learning_rate": 8.83888888888889e-07, "loss": 0.0006, "num_tokens": 2509438.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8410 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 155.75925925925927, "frac_reward_zero_std": 1.0, "grad_norm": 0.01279708556830883, "kl": 0.009746909141540527, "learning_rate": 8.833333333333334e-07, "loss": 0.0005, "num_tokens": 2509710.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8411 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 155.77777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.002711341017857194, "kl": 0.2823397219181061, "learning_rate": 8.827777777777778e-07, "loss": 0.0141, "num_tokens": 2509998.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8412 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.25, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 13.25, "completions/mean_terminated_length": 13.25, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 155.7962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.34945231676101685, "kl": 0.053152160719037056, "learning_rate": 8.822222222222222e-07, "loss": 0.0035, "num_tokens": 2510271.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8413 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 155.8148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.005642474628984928, "kl": 0.0028907503001391888, "learning_rate": 8.816666666666668e-07, "loss": 0.0001, "num_tokens": 2510575.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8414 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.5, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 8.5, "completions/mean_terminated_length": 8.5, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 155.83333333333334, "frac_reward_zero_std": 1.0, "grad_norm": 0.015319679863750935, "kl": 0.00041645317105576396, "learning_rate": 8.811111111111112e-07, "loss": 0.0, "num_tokens": 2510829.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 8415 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 155.85185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.018405528739094734, "kl": 0.006732518377248198, "learning_rate": 8.805555555555556e-07, "loss": 0.0003, "num_tokens": 2511117.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8416 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.5, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 19.5, "completions/mean_terminated_length": 19.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 155.87037037037038, "frac_reward_zero_std": 0.0, "grad_norm": 6.934492588043213, "kl": 0.39681610465049744, "learning_rate": 8.8e-07, "loss": 0.1174, "num_tokens": 2511435.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 8417 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 5.75, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 5.75, "completions/mean_terminated_length": 5.75, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 155.88888888888889, "frac_reward_zero_std": 0.0, "grad_norm": 10.148682594299316, "kl": 0.06298206746578217, "learning_rate": 8.794444444444446e-07, "loss": 0.2352, "num_tokens": 2511654.0, "reward": 2.75, "reward_std": 1.5, "rewards/reward_combined/mean": 2.75, "rewards/reward_combined/std": 1.5, "step": 8418 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.25, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.25, "completions/mean_terminated_length": 14.25, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 155.90740740740742, "frac_reward_zero_std": 1.0, "grad_norm": 0.031192952767014503, "kl": 0.0038375144358724356, "learning_rate": 8.78888888888889e-07, "loss": 0.0002, "num_tokens": 2511935.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8419 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 28.0, "completions/clipped_ratio": 0.0, "completions/max_length": 32.0, "completions/max_terminated_length": 32.0, "completions/mean_length": 28.0, "completions/mean_terminated_length": 28.0, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 155.92592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.048388440161943436, "kl": 0.06904281675815582, "learning_rate": 8.783333333333334e-07, "loss": 0.0034, "num_tokens": 2512283.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8420 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.25, "completions/clipped_ratio": 0.0, "completions/max_length": 27.0, "completions/max_terminated_length": 27.0, "completions/mean_length": 24.25, "completions/mean_terminated_length": 24.25, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 155.94444444444446, "frac_reward_zero_std": 0.0, "grad_norm": 2.732748508453369, "kl": 0.01925431052222848, "learning_rate": 8.777777777777778e-07, "loss": 0.0313, "num_tokens": 2512608.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 8421 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 155.96296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.09530682861804962, "kl": 0.007524853106588125, "learning_rate": 8.772222222222222e-07, "loss": 0.0005, "num_tokens": 2512824.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 8422 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.0, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 19.0, "completions/mean_terminated_length": 19.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 155.9814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.09054383635520935, "kl": 0.020260409452021122, "learning_rate": 8.766666666666668e-07, "loss": 0.001, "num_tokens": 2513152.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8423 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 156.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.010778219439089298, "kl": 0.008209151215851307, "learning_rate": 8.761111111111112e-07, "loss": 0.0004, "num_tokens": 2513464.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 8424 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.5, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 8.5, "completions/mean_terminated_length": 8.5, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 156.0185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.03050384856760502, "kl": 0.0024588212836533785, "learning_rate": 8.755555555555556e-07, "loss": 0.0001, "num_tokens": 2513718.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 8425 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 156.03703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.013977598398923874, "kl": 0.02778041362762451, "learning_rate": 8.75e-07, "loss": 0.0014, "num_tokens": 2513934.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 8426 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.75, "completions/clipped_ratio": 0.0, "completions/max_length": 35.0, "completions/max_terminated_length": 35.0, "completions/mean_length": 32.75, "completions/mean_terminated_length": 32.75, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 156.05555555555554, "frac_reward_zero_std": 1.0, "grad_norm": 0.05636969581246376, "kl": 0.0741523914039135, "learning_rate": 8.744444444444446e-07, "loss": 0.0037, "num_tokens": 2514293.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8427 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 156.07407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.01358525175601244, "kl": 0.03103153593838215, "learning_rate": 8.73888888888889e-07, "loss": 0.0016, "num_tokens": 2514587.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8428 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 9.5, "completions/mean_terminated_length": 9.5, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 156.09259259259258, "frac_reward_zero_std": 1.0, "grad_norm": 0.025979014113545418, "kl": 0.002243857248686254, "learning_rate": 8.733333333333334e-07, "loss": 0.0001, "num_tokens": 2514845.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8429 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 156.11111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.040831323713064194, "kl": 0.005403248127549887, "learning_rate": 8.727777777777778e-07, "loss": 0.0003, "num_tokens": 2515147.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8430 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 37.75, "completions/clipped_ratio": 0.0, "completions/max_length": 106.0, "completions/max_terminated_length": 106.0, "completions/mean_length": 37.75, "completions/mean_terminated_length": 37.75, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 156.12962962962962, "frac_reward_zero_std": 0.0, "grad_norm": 2.225511074066162, "kl": 0.0769658088684082, "learning_rate": 8.722222222222224e-07, "loss": 0.38, "num_tokens": 2515522.0, "reward": 7.800000190734863, "reward_std": 0.40000009536743164, "rewards/reward_combined/mean": 7.800000190734863, "rewards/reward_combined/std": 0.40000009536743164, "step": 8431 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 156.14814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.040157757699489594, "kl": 0.01448101457208395, "learning_rate": 8.716666666666668e-07, "loss": 0.0007, "num_tokens": 2515802.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8432 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 156.16666666666666, "frac_reward_zero_std": 1.0, "grad_norm": 0.05510338395833969, "kl": 0.0052604079246521, "learning_rate": 8.711111111111112e-07, "loss": 0.0003, "num_tokens": 2516046.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 8433 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 156.1851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.06713011860847473, "kl": 0.0021996796131134033, "learning_rate": 8.705555555555556e-07, "loss": 0.0001, "num_tokens": 2516258.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 8434 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.5, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 16.5, "completions/mean_terminated_length": 16.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 156.2037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.09763708710670471, "kl": 0.015349369030445814, "learning_rate": 8.7e-07, "loss": 0.0008, "num_tokens": 2516540.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8435 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.25, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 21.25, "completions/mean_terminated_length": 21.25, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 156.22222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.04510578513145447, "kl": 0.007873150054365396, "learning_rate": 8.694444444444446e-07, "loss": 0.0004, "num_tokens": 2516869.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8436 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 156.24074074074073, "frac_reward_zero_std": 1.0, "grad_norm": 0.007709124591201544, "kl": 0.0007352937536779791, "learning_rate": 8.68888888888889e-07, "loss": 0.0, "num_tokens": 2517165.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8437 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.0, "completions/clipped_ratio": 0.0, "completions/max_length": 33.0, "completions/max_terminated_length": 33.0, "completions/mean_length": 22.0, "completions/mean_terminated_length": 22.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 156.25925925925927, "frac_reward_zero_std": 1.0, "grad_norm": 0.23902134597301483, "kl": 0.06653273105621338, "learning_rate": 8.683333333333334e-07, "loss": 0.0036, "num_tokens": 2517473.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 8438 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 45.5, "completions/clipped_ratio": 0.0, "completions/max_length": 51.0, "completions/max_terminated_length": 51.0, "completions/mean_length": 45.5, "completions/mean_terminated_length": 45.5, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 156.27777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.06978762149810791, "kl": 0.053723761811852455, "learning_rate": 8.677777777777778e-07, "loss": 0.0027, "num_tokens": 2517935.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 8439 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.75, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 13.75, "completions/mean_terminated_length": 13.75, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 156.2962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.2927057147026062, "kl": 0.0344026698730886, "learning_rate": 8.672222222222223e-07, "loss": 0.0017, "num_tokens": 2518210.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8440 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 156.3148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 3.072303661610931e-05, "kl": 4.336237907409668e-06, "learning_rate": 8.666666666666668e-07, "loss": 0.0, "num_tokens": 2518430.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 8441 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 156.33333333333334, "frac_reward_zero_std": 1.0, "grad_norm": 0.21215026080608368, "kl": 0.03580721328034997, "learning_rate": 8.661111111111112e-07, "loss": 0.0019, "num_tokens": 2518722.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8442 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 156.35185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.021785857155919075, "kl": 0.013549901079386473, "learning_rate": 8.655555555555556e-07, "loss": 0.0007, "num_tokens": 2518982.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8443 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.25, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 18.25, "completions/mean_terminated_length": 18.25, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 156.37037037037038, "frac_reward_zero_std": 0.0, "grad_norm": 2.8480002880096436, "kl": 0.1068955585360527, "learning_rate": 8.65e-07, "loss": 0.1301, "num_tokens": 2519283.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 8444 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.25, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 15.25, "completions/mean_terminated_length": 15.25, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 156.38888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.035844236612319946, "kl": 0.23633172363042831, "learning_rate": 8.644444444444445e-07, "loss": 0.0118, "num_tokens": 2519584.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8445 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 156.40740740740742, "frac_reward_zero_std": 0.0, "grad_norm": 4.220936298370361, "kl": 0.49065499007701874, "learning_rate": 8.63888888888889e-07, "loss": -0.0243, "num_tokens": 2519866.0, "reward": 7.875, "reward_std": 0.25, "rewards/reward_combined/mean": 7.875, "rewards/reward_combined/std": 0.25, "step": 8446 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.75, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 12.75, "completions/mean_terminated_length": 12.75, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 156.42592592592592, "frac_reward_zero_std": 0.0, "grad_norm": 6.158374786376953, "kl": 0.012466434622183442, "learning_rate": 8.633333333333334e-07, "loss": 0.1859, "num_tokens": 2520141.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 8447 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 156.44444444444446, "frac_reward_zero_std": 1.0, "grad_norm": 0.04576678201556206, "kl": 0.0047461241483688354, "learning_rate": 8.627777777777778e-07, "loss": 0.0002, "num_tokens": 2520401.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 8448 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 156.46296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.02380075491964817, "kl": 0.013819561339914799, "learning_rate": 8.622222222222224e-07, "loss": 0.0007, "num_tokens": 2520691.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8449 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 156.4814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.002866714494302869, "kl": 0.28225545585155487, "learning_rate": 8.616666666666667e-07, "loss": 0.0141, "num_tokens": 2520979.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8450 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 156.5, "frac_reward_zero_std": 1.0, "grad_norm": 0.007337206043303013, "kl": 0.0009489655494689941, "learning_rate": 8.611111111111112e-07, "loss": 0.0, "num_tokens": 2521191.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 8451 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 27.25, "completions/clipped_ratio": 0.0, "completions/max_length": 29.0, "completions/max_terminated_length": 29.0, "completions/mean_length": 27.25, "completions/mean_terminated_length": 27.25, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 156.5185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.05260277912020683, "kl": 0.06543147191405296, "learning_rate": 8.605555555555556e-07, "loss": 0.0033, "num_tokens": 2521536.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8452 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 156.53703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.009877750650048256, "kl": 0.0002199113368988037, "learning_rate": 8.6e-07, "loss": 0.0, "num_tokens": 2521792.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 8453 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 41.25, "completions/clipped_ratio": 0.0, "completions/max_length": 45.0, "completions/max_terminated_length": 45.0, "completions/mean_length": 41.25, "completions/mean_terminated_length": 41.25, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 156.55555555555554, "frac_reward_zero_std": 1.0, "grad_norm": 0.045698132365942, "kl": 0.078315868973732, "learning_rate": 8.594444444444446e-07, "loss": 0.0038, "num_tokens": 2522225.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8454 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.25, "completions/clipped_ratio": 0.0, "completions/max_length": 30.0, "completions/max_terminated_length": 30.0, "completions/mean_length": 20.25, "completions/mean_terminated_length": 20.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 156.57407407407408, "frac_reward_zero_std": 0.0, "grad_norm": 4.612532138824463, "kl": 0.19653873890638351, "learning_rate": 8.58888888888889e-07, "loss": 0.1438, "num_tokens": 2522546.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 8455 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 5.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 5.0, "completions/mean_terminated_length": 5.0, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 156.59259259259258, "frac_reward_zero_std": 1.0, "grad_norm": 0.003853857982903719, "kl": 0.0005008697626180947, "learning_rate": 8.583333333333334e-07, "loss": 0.0, "num_tokens": 2522766.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 8456 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 156.61111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.01059019286185503, "kl": 0.0015007869806140661, "learning_rate": 8.577777777777778e-07, "loss": 0.0001, "num_tokens": 2523088.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8457 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 156.62962962962962, "frac_reward_zero_std": 1.0, "grad_norm": 0.02278665453195572, "kl": 0.010671626776456833, "learning_rate": 8.572222222222223e-07, "loss": 0.0005, "num_tokens": 2523400.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 8458 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 39.0, "completions/clipped_ratio": 0.0, "completions/max_length": 40.0, "completions/max_terminated_length": 40.0, "completions/mean_length": 39.0, "completions/mean_terminated_length": 39.0, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 156.64814814814815, "frac_reward_zero_std": 0.0, "grad_norm": 1.7413381338119507, "kl": 0.12253905832767487, "learning_rate": 8.566666666666668e-07, "loss": 0.0104, "num_tokens": 2523808.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 8459 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.5, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 22.5, "completions/mean_terminated_length": 22.5, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 156.66666666666666, "frac_reward_zero_std": 1.0, "grad_norm": 0.018226606771349907, "kl": 0.0066913701593875885, "learning_rate": 8.561111111111111e-07, "loss": 0.0003, "num_tokens": 2524126.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8460 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.25, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 12.25, "completions/mean_terminated_length": 12.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 156.6851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.01674228347837925, "kl": 0.000738351111067459, "learning_rate": 8.555555555555556e-07, "loss": 0.0, "num_tokens": 2524387.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8461 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 28.25, "completions/clipped_ratio": 0.0, "completions/max_length": 32.0, "completions/max_terminated_length": 32.0, "completions/mean_length": 28.25, "completions/mean_terminated_length": 28.25, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 156.7037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.8969456553459167, "kl": 0.2912696897983551, "learning_rate": 8.550000000000002e-07, "loss": 0.0141, "num_tokens": 2524716.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8462 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.0, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 19.0, "completions/mean_terminated_length": 19.0, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 156.72222222222223, "frac_reward_zero_std": 0.0, "grad_norm": 2.0992064476013184, "kl": 0.1444309949874878, "learning_rate": 8.544444444444445e-07, "loss": 0.0202, "num_tokens": 2525020.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 8463 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 33.5, "completions/clipped_ratio": 0.0, "completions/max_length": 50.0, "completions/max_terminated_length": 50.0, "completions/mean_length": 33.5, "completions/mean_terminated_length": 33.5, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 156.74074074074073, "frac_reward_zero_std": 0.0, "grad_norm": 2.5588340759277344, "kl": 0.04078717343509197, "learning_rate": 8.53888888888889e-07, "loss": -0.1011, "num_tokens": 2525378.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 8464 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 156.75925925925927, "frac_reward_zero_std": 1.0, "grad_norm": 0.012561533600091934, "kl": 0.00807950645685196, "learning_rate": 8.533333333333334e-07, "loss": 0.0004, "num_tokens": 2525614.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 8465 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 156.77777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.04261275380849838, "kl": 0.003760814666748047, "learning_rate": 8.527777777777778e-07, "loss": 0.0002, "num_tokens": 2525822.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 8466 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.25, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.25, "completions/mean_terminated_length": 13.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 156.7962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.008895041421055794, "kl": 0.0021835550433024764, "learning_rate": 8.522222222222224e-07, "loss": 0.0001, "num_tokens": 2526099.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8467 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.5, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 18.5, "completions/mean_terminated_length": 18.5, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 156.8148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.008667952381074429, "kl": 0.000825261726276949, "learning_rate": 8.516666666666667e-07, "loss": 0.0, "num_tokens": 2526413.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8468 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.5, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 20.5, "completions/mean_terminated_length": 20.5, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 156.83333333333334, "frac_reward_zero_std": 1.0, "grad_norm": 0.04781199246644974, "kl": 0.01046080607920885, "learning_rate": 8.511111111111112e-07, "loss": 0.0005, "num_tokens": 2526747.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8469 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.25, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.25, "completions/mean_terminated_length": 11.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 156.85185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.05530106648802757, "kl": 0.010603112168610096, "learning_rate": 8.505555555555556e-07, "loss": 0.0005, "num_tokens": 2527020.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8470 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.75, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.75, "completions/mean_terminated_length": 6.75, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 156.87037037037038, "frac_reward_zero_std": 1.0, "grad_norm": 0.018829021602869034, "kl": 0.0008746690582484007, "learning_rate": 8.500000000000001e-07, "loss": 0.0, "num_tokens": 2527255.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 8471 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.25, "completions/clipped_ratio": 0.0, "completions/max_length": 25.0, "completions/max_terminated_length": 25.0, "completions/mean_length": 21.25, "completions/mean_terminated_length": 21.25, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 156.88888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.7388808727264404, "kl": 0.15483305417001247, "learning_rate": 8.494444444444446e-07, "loss": 0.0087, "num_tokens": 2527560.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8472 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 41.0, "completions/clipped_ratio": 0.0, "completions/max_length": 51.0, "completions/max_terminated_length": 51.0, "completions/mean_length": 41.0, "completions/mean_terminated_length": 41.0, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 156.90740740740742, "frac_reward_zero_std": 1.0, "grad_norm": 0.019276341423392296, "kl": 0.03491540718823671, "learning_rate": 8.488888888888889e-07, "loss": 0.0017, "num_tokens": 2527944.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8473 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.75, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 17.75, "completions/mean_terminated_length": 17.75, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 156.92592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.021502088755369186, "kl": 0.00993132358416915, "learning_rate": 8.483333333333334e-07, "loss": 0.0005, "num_tokens": 2528263.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8474 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.25, "completions/clipped_ratio": 0.0, "completions/max_length": 25.0, "completions/max_terminated_length": 25.0, "completions/mean_length": 23.25, "completions/mean_terminated_length": 23.25, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 156.94444444444446, "frac_reward_zero_std": 1.0, "grad_norm": 0.18535174429416656, "kl": 0.024773845449090004, "learning_rate": 8.477777777777778e-07, "loss": 0.0013, "num_tokens": 2528604.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 8475 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 34.5, "completions/clipped_ratio": 0.0, "completions/max_length": 39.0, "completions/max_terminated_length": 39.0, "completions/mean_length": 34.5, "completions/mean_terminated_length": 34.5, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 156.96296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.10909891128540039, "kl": 0.1322271078824997, "learning_rate": 8.472222222222223e-07, "loss": 0.0066, "num_tokens": 2528966.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8476 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 156.9814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.05213681608438492, "kl": 0.005539629841223359, "learning_rate": 8.466666666666668e-07, "loss": 0.0003, "num_tokens": 2529252.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8477 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.75, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 16.75, "completions/mean_terminated_length": 16.75, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 157.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.06956744939088821, "kl": 0.1585172563791275, "learning_rate": 8.461111111111111e-07, "loss": 0.0079, "num_tokens": 2529539.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8478 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 157.0185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.007345099933445454, "kl": 0.0009210258722305298, "learning_rate": 8.455555555555556e-07, "loss": 0.0, "num_tokens": 2529751.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 8479 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 157.03703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.00034483964554965496, "kl": 2.8833746910095215e-05, "learning_rate": 8.450000000000002e-07, "loss": 0.0, "num_tokens": 2529971.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 8480 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 18.0, "completions/mean_terminated_length": 18.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 157.05555555555554, "frac_reward_zero_std": 1.0, "grad_norm": 0.20293383300304413, "kl": 0.040495860390365124, "learning_rate": 8.444444444444445e-07, "loss": 0.0021, "num_tokens": 2530263.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8481 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 157.07407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.052904196083545685, "kl": 0.001869291067123413, "learning_rate": 8.43888888888889e-07, "loss": 0.0001, "num_tokens": 2530475.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 8482 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 157.09259259259258, "frac_reward_zero_std": 1.0, "grad_norm": 0.0027683794032782316, "kl": 0.000181788214831613, "learning_rate": 8.433333333333333e-07, "loss": 0.0, "num_tokens": 2530735.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 8483 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 157.11111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.034870147705078125, "kl": 0.0047440206399187446, "learning_rate": 8.427777777777778e-07, "loss": 0.0002, "num_tokens": 2531005.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8484 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 70.0, "completions/clipped_ratio": 0.0, "completions/max_length": 176.0, "completions/max_terminated_length": 176.0, "completions/mean_length": 70.0, "completions/mean_terminated_length": 70.0, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 157.12962962962962, "frac_reward_zero_std": 0.0, "grad_norm": 2.3835604190826416, "kl": 0.13673484697937965, "learning_rate": 8.422222222222224e-07, "loss": 0.357, "num_tokens": 2531509.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 8485 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.75, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.75, "completions/mean_terminated_length": 14.75, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 157.14814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.031131740659475327, "kl": 0.03779991390183568, "learning_rate": 8.416666666666667e-07, "loss": 0.0019, "num_tokens": 2531800.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8486 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 38.0, "completions/clipped_ratio": 0.0, "completions/max_length": 52.0, "completions/max_terminated_length": 52.0, "completions/mean_length": 38.0, "completions/mean_terminated_length": 38.0, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 157.16666666666666, "frac_reward_zero_std": 1.0, "grad_norm": 0.05172140151262283, "kl": 0.041300226002931595, "learning_rate": 8.411111111111112e-07, "loss": 0.002, "num_tokens": 2532176.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8487 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 157.1851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.03793235123157501, "kl": 0.009634940419346094, "learning_rate": 8.405555555555555e-07, "loss": 0.0005, "num_tokens": 2532456.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8488 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.5, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 2.5, "completions/mean_terminated_length": 2.5, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 157.2037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.09595293551683426, "kl": 0.005947023630142212, "learning_rate": 8.400000000000001e-07, "loss": 0.0004, "num_tokens": 2532662.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 8489 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.25, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 22.25, "completions/mean_terminated_length": 22.25, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 157.22222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.03736020624637604, "kl": 0.010665329173207283, "learning_rate": 8.394444444444446e-07, "loss": 0.0005, "num_tokens": 2532979.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8490 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 157.24074074074073, "frac_reward_zero_std": 1.0, "grad_norm": 0.12586939334869385, "kl": 0.01141316443681717, "learning_rate": 8.388888888888889e-07, "loss": 0.0006, "num_tokens": 2533223.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 8491 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.5, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 18.5, "completions/mean_terminated_length": 18.5, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 157.25925925925927, "frac_reward_zero_std": 1.0, "grad_norm": 0.00789565872400999, "kl": 0.0006844750605523586, "learning_rate": 8.383333333333334e-07, "loss": 0.0, "num_tokens": 2533537.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8492 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 157.27777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.010117822326719761, "kl": 0.0002429842934361659, "learning_rate": 8.37777777777778e-07, "loss": 0.0, "num_tokens": 2533793.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 8493 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 157.2962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.00264229578897357, "kl": 0.28236332535743713, "learning_rate": 8.372222222222223e-07, "loss": 0.0141, "num_tokens": 2534081.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8494 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 157.3148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.15220826864242554, "kl": 0.017015773802995682, "learning_rate": 8.366666666666668e-07, "loss": 0.0009, "num_tokens": 2534341.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 8495 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.75, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 16.75, "completions/mean_terminated_length": 16.75, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 157.33333333333334, "frac_reward_zero_std": 1.0, "grad_norm": 0.10819786787033081, "kl": 0.022829510271549225, "learning_rate": 8.361111111111111e-07, "loss": 0.0012, "num_tokens": 2534624.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8496 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 38.75, "completions/clipped_ratio": 0.0, "completions/max_length": 44.0, "completions/max_terminated_length": 44.0, "completions/mean_length": 38.75, "completions/mean_terminated_length": 38.75, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 157.35185185185185, "frac_reward_zero_std": 0.0, "grad_norm": 2.1095058917999268, "kl": 0.08294228464365005, "learning_rate": 8.355555555555556e-07, "loss": -0.0017, "num_tokens": 2535031.0, "reward": 5.125, "reward_std": 3.4731109142303467, "rewards/reward_combined/mean": 5.125, "rewards/reward_combined/std": 3.4731109142303467, "step": 8497 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.5, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 16.5, "completions/mean_terminated_length": 16.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 157.37037037037038, "frac_reward_zero_std": 1.0, "grad_norm": 0.0888051986694336, "kl": 0.024199441075325012, "learning_rate": 8.350000000000002e-07, "loss": 0.0014, "num_tokens": 2535321.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8498 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 157.38888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.010611021891236305, "kl": 0.008230927400290966, "learning_rate": 8.344444444444445e-07, "loss": 0.0004, "num_tokens": 2535633.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 8499 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 34.75, "completions/clipped_ratio": 0.0, "completions/max_length": 36.0, "completions/max_terminated_length": 36.0, "completions/mean_length": 34.75, "completions/mean_terminated_length": 34.75, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 157.40740740740742, "frac_reward_zero_std": 1.0, "grad_norm": 0.043865129351615906, "kl": 0.07931235060095787, "learning_rate": 8.33888888888889e-07, "loss": 0.004, "num_tokens": 2536000.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8500 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.75, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 21.75, "completions/mean_terminated_length": 21.75, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 157.42592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.09828624874353409, "kl": 0.014080651104450226, "learning_rate": 8.333333333333333e-07, "loss": 0.0007, "num_tokens": 2536335.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 8501 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 157.44444444444446, "frac_reward_zero_std": 1.0, "grad_norm": 0.1784076988697052, "kl": 0.03053449373692274, "learning_rate": 8.327777777777779e-07, "loss": 0.0015, "num_tokens": 2536623.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8502 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.75, "completions/clipped_ratio": 0.0, "completions/max_length": 28.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 22.75, "completions/mean_terminated_length": 22.75, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 157.46296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.25392580032348633, "kl": 0.02784003084525466, "learning_rate": 8.322222222222224e-07, "loss": 0.0015, "num_tokens": 2536966.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8503 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 36.75, "completions/clipped_ratio": 0.0, "completions/max_length": 45.0, "completions/max_terminated_length": 45.0, "completions/mean_length": 36.75, "completions/mean_terminated_length": 36.75, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 157.4814814814815, "frac_reward_zero_std": 0.0, "grad_norm": 0.7595045566558838, "kl": 0.20430171862244606, "learning_rate": 8.316666666666667e-07, "loss": -0.1435, "num_tokens": 2537381.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 8504 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.25, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 15.25, "completions/mean_terminated_length": 15.25, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 157.5, "frac_reward_zero_std": 1.0, "grad_norm": 0.0422712042927742, "kl": 0.23742271959781647, "learning_rate": 8.311111111111112e-07, "loss": 0.0118, "num_tokens": 2537682.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8505 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.0, "completions/clipped_ratio": 0.0, "completions/max_length": 30.0, "completions/max_terminated_length": 30.0, "completions/mean_length": 22.0, "completions/mean_terminated_length": 22.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 157.5185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.0926942452788353, "kl": 0.06728080287575722, "learning_rate": 8.305555555555555e-07, "loss": 0.0031, "num_tokens": 2537998.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8506 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.0, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 21.0, "completions/mean_terminated_length": 21.0, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 157.53703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.02170669287443161, "kl": 0.006665390450507402, "learning_rate": 8.300000000000001e-07, "loss": 0.0003, "num_tokens": 2538326.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8507 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.25, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 19.25, "completions/mean_terminated_length": 19.25, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 157.55555555555554, "frac_reward_zero_std": 0.0, "grad_norm": 2.677144765853882, "kl": 0.007026412524282932, "learning_rate": 8.294444444444446e-07, "loss": -0.024, "num_tokens": 2538631.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 8508 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 29.5, "completions/clipped_ratio": 0.0, "completions/max_length": 41.0, "completions/max_terminated_length": 41.0, "completions/mean_length": 29.5, "completions/mean_terminated_length": 29.5, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 157.57407407407408, "frac_reward_zero_std": 0.0, "grad_norm": 1.9943318367004395, "kl": 0.13968484103679657, "learning_rate": 8.288888888888889e-07, "loss": 0.0912, "num_tokens": 2538985.0, "reward": 4.75, "reward_std": 3.4034295082092285, "rewards/reward_combined/mean": 4.75, "rewards/reward_combined/std": 3.4034297466278076, "step": 8509 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 157.59259259259258, "frac_reward_zero_std": 1.0, "grad_norm": 0.0451001338660717, "kl": 0.006704659492243081, "learning_rate": 8.283333333333334e-07, "loss": 0.0004, "num_tokens": 2539257.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8510 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.75, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.75, "completions/mean_terminated_length": 6.75, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 157.61111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.012082036584615707, "kl": 0.0007601536635775119, "learning_rate": 8.277777777777779e-07, "loss": 0.0, "num_tokens": 2539492.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 8511 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.5, "completions/clipped_ratio": 0.0, "completions/max_length": 25.0, "completions/max_terminated_length": 25.0, "completions/mean_length": 19.5, "completions/mean_terminated_length": 19.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 157.62962962962962, "frac_reward_zero_std": 1.0, "grad_norm": 0.04644699767231941, "kl": 0.017071343027055264, "learning_rate": 8.272222222222223e-07, "loss": 0.0009, "num_tokens": 2539818.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8512 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.75, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.75, "completions/mean_terminated_length": 14.75, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 157.64814814814815, "frac_reward_zero_std": 0.0, "grad_norm": 8.394041061401367, "kl": 0.5057739615440369, "learning_rate": 8.266666666666668e-07, "loss": 0.0073, "num_tokens": 2540101.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 8513 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 157.66666666666666, "frac_reward_zero_std": 1.0, "grad_norm": 0.012097484432160854, "kl": 0.0027090541552752256, "learning_rate": 8.261111111111111e-07, "loss": 0.0001, "num_tokens": 2540403.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8514 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 157.6851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.042024530470371246, "kl": 0.0053962767124176025, "learning_rate": 8.255555555555556e-07, "loss": 0.0003, "num_tokens": 2540687.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8515 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 39.75, "completions/clipped_ratio": 0.0, "completions/max_length": 48.0, "completions/max_terminated_length": 48.0, "completions/mean_length": 39.75, "completions/mean_terminated_length": 39.75, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 157.7037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.01689073257148266, "kl": 0.035517267882823944, "learning_rate": 8.250000000000001e-07, "loss": 0.0017, "num_tokens": 2541066.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8516 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.25, "completions/clipped_ratio": 0.0, "completions/max_length": 33.0, "completions/max_terminated_length": 33.0, "completions/mean_length": 20.25, "completions/mean_terminated_length": 20.25, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 157.72222222222223, "frac_reward_zero_std": 0.0, "grad_norm": 5.676070690155029, "kl": 0.2631717324256897, "learning_rate": 8.244444444444445e-07, "loss": 0.1929, "num_tokens": 2541367.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 8517 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 35.25, "completions/clipped_ratio": 0.0, "completions/max_length": 42.0, "completions/max_terminated_length": 42.0, "completions/mean_length": 35.25, "completions/mean_terminated_length": 35.25, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 157.74074074074073, "frac_reward_zero_std": 0.0, "grad_norm": 2.03556227684021, "kl": 0.20892271399497986, "learning_rate": 8.23888888888889e-07, "loss": 0.0031, "num_tokens": 2541724.0, "reward": 4.625, "reward_std": 4.308422088623047, "rewards/reward_combined/mean": 4.625, "rewards/reward_combined/std": 4.308422088623047, "step": 8518 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 12.5, "completions/mean_terminated_length": 12.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 157.75925925925927, "frac_reward_zero_std": 1.0, "grad_norm": 0.4099169969558716, "kl": 0.0514442422427237, "learning_rate": 8.233333333333333e-07, "loss": 0.003, "num_tokens": 2541998.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8519 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.75, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 15.75, "completions/mean_terminated_length": 15.75, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 157.77777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.033930301666259766, "kl": 0.03781995177268982, "learning_rate": 8.227777777777779e-07, "loss": 0.0019, "num_tokens": 2542293.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8520 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 50.0, "completions/clipped_ratio": 0.0, "completions/max_length": 55.0, "completions/max_terminated_length": 55.0, "completions/mean_length": 50.0, "completions/mean_terminated_length": 50.0, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 157.7962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.03768501430749893, "kl": 0.04602782242000103, "learning_rate": 8.222222222222223e-07, "loss": 0.0023, "num_tokens": 2542773.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 8521 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.75, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 4.75, "completions/mean_terminated_length": 4.75, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 157.8148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.015923617407679558, "kl": 0.0016374543774873018, "learning_rate": 8.216666666666667e-07, "loss": 0.0001, "num_tokens": 2542992.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 8522 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.5, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 17.5, "completions/mean_terminated_length": 17.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 157.83333333333334, "frac_reward_zero_std": 1.0, "grad_norm": 0.023705219849944115, "kl": 0.17144690454006195, "learning_rate": 8.211111111111112e-07, "loss": 0.0086, "num_tokens": 2543302.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8523 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 157.85185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.011794921942055225, "kl": 0.008267708122730255, "learning_rate": 8.205555555555557e-07, "loss": 0.0004, "num_tokens": 2543538.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 8524 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.75, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 16.75, "completions/mean_terminated_length": 16.75, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 157.87037037037038, "frac_reward_zero_std": 1.0, "grad_norm": 0.06511075794696808, "kl": 0.009512116841506213, "learning_rate": 8.200000000000001e-07, "loss": 0.0005, "num_tokens": 2543865.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8525 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 157.88888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.027783164754509926, "kl": 0.012502485420554876, "learning_rate": 8.194444444444446e-07, "loss": 0.0006, "num_tokens": 2544125.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8526 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 9.5, "completions/mean_terminated_length": 9.5, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 157.90740740740742, "frac_reward_zero_std": 1.0, "grad_norm": 0.021274350583553314, "kl": 0.0017437011119909585, "learning_rate": 8.188888888888889e-07, "loss": 0.0001, "num_tokens": 2544383.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8527 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.5, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 17.5, "completions/mean_terminated_length": 17.5, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 157.92592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.11089117079973221, "kl": 0.04838044010102749, "learning_rate": 8.183333333333334e-07, "loss": 0.0024, "num_tokens": 2544673.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 8528 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 157.94444444444446, "frac_reward_zero_std": 1.0, "grad_norm": 0.006860919296741486, "kl": 0.0005891919136047363, "learning_rate": 8.177777777777779e-07, "loss": 0.0, "num_tokens": 2544969.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8529 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.25, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.25, "completions/mean_terminated_length": 11.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 157.96296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.027210572734475136, "kl": 0.007547880057245493, "learning_rate": 8.172222222222223e-07, "loss": 0.0004, "num_tokens": 2545242.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8530 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 157.9814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.169637992978096, "kl": 0.029637997969985008, "learning_rate": 8.166666666666668e-07, "loss": 0.0016, "num_tokens": 2545522.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8531 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.5, "completions/clipped_ratio": 0.0, "completions/max_length": 46.0, "completions/max_terminated_length": 46.0, "completions/mean_length": 16.5, "completions/mean_terminated_length": 16.5, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 158.0, "frac_reward_zero_std": 0.0, "grad_norm": 8.55252456665039, "kl": 1.4069570414721966, "learning_rate": 8.161111111111111e-07, "loss": 0.5466, "num_tokens": 2545788.0, "reward": 1.2999999523162842, "reward_std": 3.6823906898498535, "rewards/reward_combined/mean": 1.2999999523162842, "rewards/reward_combined/std": 3.6823906898498535, "step": 8532 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 34.75, "completions/clipped_ratio": 0.0, "completions/max_length": 36.0, "completions/max_terminated_length": 36.0, "completions/mean_length": 34.75, "completions/mean_terminated_length": 34.75, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 158.0185185185185, "frac_reward_zero_std": 0.0, "grad_norm": 1.5596407651901245, "kl": 0.2926347851753235, "learning_rate": 8.155555555555557e-07, "loss": -0.0102, "num_tokens": 2546155.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 8533 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 158.03703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.057070985436439514, "kl": 0.2883012890815735, "learning_rate": 8.150000000000001e-07, "loss": 0.0144, "num_tokens": 2546443.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8534 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.0, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 22.0, "completions/mean_terminated_length": 22.0, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 158.05555555555554, "frac_reward_zero_std": 1.0, "grad_norm": 0.02273504249751568, "kl": 0.00865155877545476, "learning_rate": 8.144444444444445e-07, "loss": 0.0004, "num_tokens": 2546759.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8535 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 158.07407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.014849398285150528, "kl": 0.00812439899891615, "learning_rate": 8.13888888888889e-07, "loss": 0.0004, "num_tokens": 2547071.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 8536 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 158.09259259259258, "frac_reward_zero_std": 0.0, "grad_norm": 9.383576393127441, "kl": 0.005300122778862715, "learning_rate": 8.133333333333333e-07, "loss": 0.2164, "num_tokens": 2547291.0, "reward": 2.75, "reward_std": 1.5, "rewards/reward_combined/mean": 2.75, "rewards/reward_combined/std": 1.5, "step": 8537 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 158.11111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.03661257401108742, "kl": 0.010465209372341633, "learning_rate": 8.127777777777779e-07, "loss": 0.0005, "num_tokens": 2547569.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8538 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 158.12962962962962, "frac_reward_zero_std": 1.0, "grad_norm": 0.019774550572037697, "kl": 0.0032163881696760654, "learning_rate": 8.122222222222223e-07, "loss": 0.0002, "num_tokens": 2547853.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8539 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 37.0, "completions/clipped_ratio": 0.0, "completions/max_length": 49.0, "completions/max_terminated_length": 49.0, "completions/mean_length": 37.0, "completions/mean_terminated_length": 37.0, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 158.14814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.44357624650001526, "kl": 0.17782670259475708, "learning_rate": 8.116666666666667e-07, "loss": 0.0092, "num_tokens": 2548225.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8540 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.75, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 6.75, "completions/mean_terminated_length": 6.75, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 158.16666666666666, "frac_reward_zero_std": 1.0, "grad_norm": 0.1469622403383255, "kl": 0.03613673895597458, "learning_rate": 8.111111111111112e-07, "loss": 0.0021, "num_tokens": 2548452.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 8541 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 158.1851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 5.664015770889819e-05, "kl": 5.066394805908203e-06, "learning_rate": 8.105555555555557e-07, "loss": 0.0, "num_tokens": 2548672.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 8542 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 158.2037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.024565916508436203, "kl": 0.01999256831186358, "learning_rate": 8.100000000000001e-07, "loss": 0.001, "num_tokens": 2548960.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8543 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 158.22222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.011899351142346859, "kl": 0.008246593177318573, "learning_rate": 8.094444444444445e-07, "loss": 0.0004, "num_tokens": 2549196.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 8544 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.75, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 17.75, "completions/mean_terminated_length": 17.75, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 158.24074074074073, "frac_reward_zero_std": 0.0, "grad_norm": 3.335984468460083, "kl": 0.18453818559646606, "learning_rate": 8.088888888888889e-07, "loss": 0.0493, "num_tokens": 2549487.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 8545 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 44.75, "completions/clipped_ratio": 0.0, "completions/max_length": 52.0, "completions/max_terminated_length": 52.0, "completions/mean_length": 44.75, "completions/mean_terminated_length": 44.75, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 158.25925925925927, "frac_reward_zero_std": 0.0, "grad_norm": 1.4211270809173584, "kl": 0.07316446304321289, "learning_rate": 8.083333333333334e-07, "loss": 0.0228, "num_tokens": 2549886.0, "reward": 5.25, "reward_std": 4.5, "rewards/reward_combined/mean": 5.25, "rewards/reward_combined/std": 4.5, "step": 8546 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.75, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 11.75, "completions/mean_terminated_length": 11.75, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 158.27777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.01598695106804371, "kl": 0.0007632037450093776, "learning_rate": 8.077777777777779e-07, "loss": 0.0, "num_tokens": 2550145.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8547 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.25, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.25, "completions/mean_terminated_length": 13.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 158.2962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.0015199462650343776, "kl": 0.001849511987529695, "learning_rate": 8.072222222222223e-07, "loss": 0.0001, "num_tokens": 2550422.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8548 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 158.3148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.05245418846607208, "kl": 0.0014176488330122083, "learning_rate": 8.066666666666667e-07, "loss": 0.0001, "num_tokens": 2550678.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 8549 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 158.33333333333334, "frac_reward_zero_std": 1.0, "grad_norm": 0.04756280779838562, "kl": 0.004613341996446252, "learning_rate": 8.061111111111111e-07, "loss": 0.0002, "num_tokens": 2550946.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8550 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.75, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 15.75, "completions/mean_terminated_length": 15.75, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 158.35185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.036881644278764725, "kl": 0.22337517887353897, "learning_rate": 8.055555555555557e-07, "loss": 0.0112, "num_tokens": 2551249.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8551 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 43.75, "completions/clipped_ratio": 0.0, "completions/max_length": 51.0, "completions/max_terminated_length": 51.0, "completions/mean_length": 43.75, "completions/mean_terminated_length": 43.75, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "epoch": 158.37037037037038, "frac_reward_zero_std": 1.0, "grad_norm": 0.06910208612680435, "kl": 0.05630475655198097, "learning_rate": 8.050000000000001e-07, "loss": 0.0028, "num_tokens": 2551704.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 8552 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.25, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.25, "completions/mean_terminated_length": 15.25, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 158.38888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.020307330414652824, "kl": 0.0019635752541944385, "learning_rate": 8.044444444444445e-07, "loss": 0.0001, "num_tokens": 2552025.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8553 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.75, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.75, "completions/mean_terminated_length": 11.75, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 158.40740740740742, "frac_reward_zero_std": 1.0, "grad_norm": 0.02157142013311386, "kl": 0.007667801808565855, "learning_rate": 8.038888888888889e-07, "loss": 0.0004, "num_tokens": 2552300.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8554 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.75, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 21.75, "completions/mean_terminated_length": 21.75, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 158.42592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.033330515027046204, "kl": 0.010705887340009212, "learning_rate": 8.033333333333335e-07, "loss": 0.0005, "num_tokens": 2552631.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8555 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.75, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 4.75, "completions/mean_terminated_length": 4.75, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 158.44444444444446, "frac_reward_zero_std": 1.0, "grad_norm": 0.017070775851607323, "kl": 0.0016063988441601396, "learning_rate": 8.027777777777779e-07, "loss": 0.0001, "num_tokens": 2552850.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 8556 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.75, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.75, "completions/mean_terminated_length": 6.75, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 158.46296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.00982145220041275, "kl": 0.0007188830932136625, "learning_rate": 8.022222222222223e-07, "loss": 0.0, "num_tokens": 2553085.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 8557 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 42.0, "completions/clipped_ratio": 0.0, "completions/max_length": 90.0, "completions/max_terminated_length": 90.0, "completions/mean_length": 42.0, "completions/mean_terminated_length": 42.0, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 158.4814814814815, "frac_reward_zero_std": 0.0, "grad_norm": 3.2373600006103516, "kl": 0.08969419822096825, "learning_rate": 8.016666666666667e-07, "loss": 0.2774, "num_tokens": 2553489.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 8558 }, { "clip_ratio/high_max": 0.03125, "clip_ratio/high_mean": 0.03125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03125, "completion_length": 15.25, "completions/clipped_ratio": 0.0, "completions/max_length": 29.0, "completions/max_terminated_length": 29.0, "completions/mean_length": 15.25, "completions/mean_terminated_length": 15.25, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 158.5, "frac_reward_zero_std": 0.0, "grad_norm": 3.081714391708374, "kl": 0.06399360299110413, "learning_rate": 8.011111111111111e-07, "loss": -0.1409, "num_tokens": 2553770.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 8559 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.25, "completions/clipped_ratio": 0.0, "completions/max_length": 30.0, "completions/max_terminated_length": 30.0, "completions/mean_length": 22.25, "completions/mean_terminated_length": 22.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 158.5185185185185, "frac_reward_zero_std": 0.0, "grad_norm": 4.009470462799072, "kl": 0.1658046878874302, "learning_rate": 8.005555555555557e-07, "loss": 0.011, "num_tokens": 2554079.0, "reward": 2.5, "reward_std": 1.7320507764816284, "rewards/reward_combined/mean": 2.5, "rewards/reward_combined/std": 1.7320507764816284, "step": 8560 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 158.53703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.032013002783060074, "kl": 0.0017960083205252886, "learning_rate": 8.000000000000001e-07, "loss": 0.0001, "num_tokens": 2554373.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8561 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.25, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 18.25, "completions/mean_terminated_length": 18.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 158.55555555555554, "frac_reward_zero_std": 1.0, "grad_norm": 0.12137462943792343, "kl": 0.03439438156783581, "learning_rate": 7.994444444444445e-07, "loss": 0.0017, "num_tokens": 2554666.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8562 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 158.57407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.025754859670996666, "kl": 0.04684752970933914, "learning_rate": 7.988888888888889e-07, "loss": 0.0026, "num_tokens": 2554942.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8563 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.75, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 19.75, "completions/mean_terminated_length": 19.75, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 158.59259259259258, "frac_reward_zero_std": 1.0, "grad_norm": 0.03308752551674843, "kl": 0.019040136598050594, "learning_rate": 7.983333333333335e-07, "loss": 0.001, "num_tokens": 2555269.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8564 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 158.61111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.0284771379083395, "kl": 0.012323141098022461, "learning_rate": 7.977777777777779e-07, "loss": 0.0006, "num_tokens": 2555529.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8565 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 158.62962962962962, "frac_reward_zero_std": 1.0, "grad_norm": 0.029937492683529854, "kl": 0.0025916837621480227, "learning_rate": 7.972222222222223e-07, "loss": 0.0001, "num_tokens": 2555785.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8566 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.25, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 4.25, "completions/mean_terminated_length": 4.25, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 158.64814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.020968705415725708, "kl": 0.0004894480225630105, "learning_rate": 7.966666666666667e-07, "loss": 0.0, "num_tokens": 2555998.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 8567 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.75, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 11.75, "completions/mean_terminated_length": 11.75, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 158.66666666666666, "frac_reward_zero_std": 0.0, "grad_norm": 5.162417411804199, "kl": 0.05522117391228676, "learning_rate": 7.961111111111111e-07, "loss": 0.026, "num_tokens": 2556297.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 8568 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 158.6851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.08272397518157959, "kl": 0.061282746493816376, "learning_rate": 7.955555555555557e-07, "loss": 0.0031, "num_tokens": 2556583.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8569 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 38.25, "completions/clipped_ratio": 0.0, "completions/max_length": 46.0, "completions/max_terminated_length": 46.0, "completions/mean_length": 38.25, "completions/mean_terminated_length": 38.25, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 158.7037037037037, "frac_reward_zero_std": 0.0, "grad_norm": 2.252150535583496, "kl": 0.17839960381388664, "learning_rate": 7.950000000000001e-07, "loss": 0.0422, "num_tokens": 2556960.0, "reward": 4.625, "reward_std": 5.75, "rewards/reward_combined/mean": 4.625, "rewards/reward_combined/std": 5.75, "step": 8570 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.25, "completions/clipped_ratio": 0.0, "completions/max_length": 30.0, "completions/max_terminated_length": 30.0, "completions/mean_length": 22.25, "completions/mean_terminated_length": 22.25, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 158.72222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.07702163606882095, "kl": 0.0846282746642828, "learning_rate": 7.944444444444445e-07, "loss": 0.004, "num_tokens": 2557277.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8571 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.25, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 21.25, "completions/mean_terminated_length": 21.25, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 158.74074074074073, "frac_reward_zero_std": 0.0, "grad_norm": 4.372359752655029, "kl": 0.07459400128573179, "learning_rate": 7.938888888888889e-07, "loss": 0.0247, "num_tokens": 2557614.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 8572 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.0, "completions/clipped_ratio": 0.0, "completions/max_length": 25.0, "completions/max_terminated_length": 25.0, "completions/mean_length": 20.0, "completions/mean_terminated_length": 20.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 158.75925925925927, "frac_reward_zero_std": 1.0, "grad_norm": 0.27239325642585754, "kl": 0.04558637551963329, "learning_rate": 7.933333333333335e-07, "loss": 0.002, "num_tokens": 2557942.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 8573 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 158.77777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.0275861956179142, "kl": 0.002302202490682248, "learning_rate": 7.927777777777779e-07, "loss": 0.0001, "num_tokens": 2558202.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 8574 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 158.7962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.033963702619075775, "kl": 0.0029247254133224487, "learning_rate": 7.922222222222223e-07, "loss": 0.0002, "num_tokens": 2558410.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 8575 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 42.0, "completions/clipped_ratio": 0.0, "completions/max_length": 45.0, "completions/max_terminated_length": 45.0, "completions/mean_length": 42.0, "completions/mean_terminated_length": 42.0, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 158.8148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.006909782998263836, "kl": 0.0751451775431633, "learning_rate": 7.916666666666667e-07, "loss": 0.0037, "num_tokens": 2558846.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8576 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 31.5, "completions/clipped_ratio": 0.0, "completions/max_length": 41.0, "completions/max_terminated_length": 41.0, "completions/mean_length": 31.5, "completions/mean_terminated_length": 31.5, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 158.83333333333334, "frac_reward_zero_std": 0.0, "grad_norm": 1.7182960510253906, "kl": 0.08346747979521751, "learning_rate": 7.911111111111111e-07, "loss": -0.1706, "num_tokens": 2559224.0, "reward": 7.75, "reward_std": 0.28867512941360474, "rewards/reward_combined/mean": 7.75, "rewards/reward_combined/std": 0.28867512941360474, "step": 8577 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 158.85185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.04544355720281601, "kl": 0.008628660347312689, "learning_rate": 7.905555555555557e-07, "loss": 0.0004, "num_tokens": 2559496.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8578 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 158.87037037037038, "frac_reward_zero_std": 1.0, "grad_norm": 0.12938766181468964, "kl": 0.013773120939731598, "learning_rate": 7.900000000000001e-07, "loss": 0.0007, "num_tokens": 2559756.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 8579 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 7.75, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 7.75, "completions/mean_terminated_length": 7.75, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 158.88888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.04742911458015442, "kl": 0.002640897175297141, "learning_rate": 7.894444444444445e-07, "loss": 0.0001, "num_tokens": 2559999.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 8580 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.5, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 18.5, "completions/mean_terminated_length": 18.5, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 158.90740740740742, "frac_reward_zero_std": 1.0, "grad_norm": 0.008011800236999989, "kl": 0.0007143835246097296, "learning_rate": 7.888888888888889e-07, "loss": 0.0, "num_tokens": 2560313.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8581 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 28.75, "completions/clipped_ratio": 0.0, "completions/max_length": 35.0, "completions/max_terminated_length": 35.0, "completions/mean_length": 28.75, "completions/mean_terminated_length": 28.75, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 158.92592592592592, "frac_reward_zero_std": 0.0, "grad_norm": 2.368595600128174, "kl": 0.17921407520771027, "learning_rate": 7.883333333333334e-07, "loss": 0.0238, "num_tokens": 2560644.0, "reward": 5.75, "reward_std": 2.598076105117798, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 2.598076105117798, "step": 8582 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 158.94444444444446, "frac_reward_zero_std": 1.0, "grad_norm": 0.020189620554447174, "kl": 0.007895810529589653, "learning_rate": 7.877777777777779e-07, "loss": 0.0004, "num_tokens": 2560932.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8583 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.75, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 16.75, "completions/mean_terminated_length": 16.75, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 158.96296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.03557233884930611, "kl": 0.011021747253835201, "learning_rate": 7.872222222222223e-07, "loss": 0.0005, "num_tokens": 2561227.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8584 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.25, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 15.25, "completions/mean_terminated_length": 15.25, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 158.9814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.17599409818649292, "kl": 0.4191306382417679, "learning_rate": 7.866666666666667e-07, "loss": 0.0209, "num_tokens": 2561512.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8585 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.0, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 17.0, "completions/mean_terminated_length": 17.0, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 159.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.052758872509002686, "kl": 0.15223341435194016, "learning_rate": 7.861111111111113e-07, "loss": 0.0077, "num_tokens": 2561820.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8586 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 5.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 5.0, "completions/mean_terminated_length": 5.0, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 159.0185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.0035217388067394495, "kl": 0.000445622208644636, "learning_rate": 7.855555555555556e-07, "loss": 0.0, "num_tokens": 2562040.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 8587 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.0, "completions/clipped_ratio": 0.0, "completions/max_length": 25.0, "completions/max_terminated_length": 25.0, "completions/mean_length": 21.0, "completions/mean_terminated_length": 21.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 159.03703703703704, "frac_reward_zero_std": 0.0, "grad_norm": 4.282203197479248, "kl": 0.13625355390831828, "learning_rate": 7.850000000000001e-07, "loss": 0.0266, "num_tokens": 2562372.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 8588 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 159.05555555555554, "frac_reward_zero_std": 1.0, "grad_norm": 0.012875201180577278, "kl": 0.004510755999945104, "learning_rate": 7.844444444444445e-07, "loss": 0.0002, "num_tokens": 2562660.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8589 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.0, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 22.0, "completions/mean_terminated_length": 22.0, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 159.07407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.08269461989402771, "kl": 0.015432554529979825, "learning_rate": 7.838888888888889e-07, "loss": 0.0008, "num_tokens": 2562992.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8590 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.5, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 10.5, "completions/mean_terminated_length": 10.5, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 159.09259259259258, "frac_reward_zero_std": 1.0, "grad_norm": 0.026284340769052505, "kl": 0.0025786944897845387, "learning_rate": 7.833333333333335e-07, "loss": 0.0001, "num_tokens": 2563254.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8591 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 159.11111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.022229257971048355, "kl": 0.011157507076859474, "learning_rate": 7.827777777777778e-07, "loss": 0.0006, "num_tokens": 2563566.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 8592 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.25, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.25, "completions/mean_terminated_length": 14.25, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 159.12962962962962, "frac_reward_zero_std": 1.0, "grad_norm": 0.0138197997584939, "kl": 0.002066040993668139, "learning_rate": 7.822222222222223e-07, "loss": 0.0001, "num_tokens": 2563847.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8593 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.5, "completions/clipped_ratio": 0.0, "completions/max_length": 27.0, "completions/max_terminated_length": 27.0, "completions/mean_length": 21.5, "completions/mean_terminated_length": 21.5, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 159.14814814814815, "frac_reward_zero_std": 0.0, "grad_norm": 2.3177683353424072, "kl": 0.1852301061153412, "learning_rate": 7.816666666666667e-07, "loss": -0.0907, "num_tokens": 2564161.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 8594 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 159.16666666666666, "frac_reward_zero_std": 1.0, "grad_norm": 0.04588257893919945, "kl": 0.040198428090661764, "learning_rate": 7.811111111111112e-07, "loss": 0.002, "num_tokens": 2564453.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8595 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.5, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 22.5, "completions/mean_terminated_length": 22.5, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 159.1851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.025915948674082756, "kl": 0.008329503238201141, "learning_rate": 7.805555555555557e-07, "loss": 0.0004, "num_tokens": 2564771.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8596 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.75, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 3.75, "completions/mean_terminated_length": 3.75, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "epoch": 159.2037037037037, "frac_reward_zero_std": 0.0, "grad_norm": 14.074394226074219, "kl": 0.15285054594278336, "learning_rate": 7.8e-07, "loss": -0.0595, "num_tokens": 2564982.0, "reward": 2.75, "reward_std": 1.5, "rewards/reward_combined/mean": 2.75, "rewards/reward_combined/std": 1.5, "step": 8597 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.013513513840734959, "clip_ratio/low_min": 0.013513513840734959, "clip_ratio/region_mean": 0.013513513840734959, "completion_length": 13.75, "completions/clipped_ratio": 0.0, "completions/max_length": 28.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 13.75, "completions/mean_terminated_length": 13.75, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 159.22222222222223, "frac_reward_zero_std": 0.0, "grad_norm": 5.850082874298096, "kl": 0.029232933185994625, "learning_rate": 7.794444444444445e-07, "loss": 0.2565, "num_tokens": 2565261.0, "reward": 5.375, "reward_std": 4.25, "rewards/reward_combined/mean": 5.375, "rewards/reward_combined/std": 4.25, "step": 8598 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.75, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 19.75, "completions/mean_terminated_length": 19.75, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 159.24074074074073, "frac_reward_zero_std": 1.0, "grad_norm": 0.07089092582464218, "kl": 0.01252398593351245, "learning_rate": 7.788888888888889e-07, "loss": 0.0006, "num_tokens": 2565592.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8599 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 17.0, "completions/mean_terminated_length": 17.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 159.25925925925927, "frac_reward_zero_std": 1.0, "grad_norm": 0.0073170908726751804, "kl": 0.17490877211093903, "learning_rate": 7.783333333333334e-07, "loss": 0.0087, "num_tokens": 2565900.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8600 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 35.0, "completions/max_terminated_length": 35.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 159.27777777777777, "frac_reward_zero_std": 0.0, "grad_norm": 3.918499231338501, "kl": 0.02519828057847917, "learning_rate": 7.777777777777779e-07, "loss": 0.3008, "num_tokens": 2566188.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 8601 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 87.5, "completions/clipped_ratio": 0.25, "completions/max_length": 256.0, "completions/max_terminated_length": 33.0, "completions/mean_length": 87.5, "completions/mean_terminated_length": 31.33333396911621, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 159.2962962962963, "frac_reward_zero_std": 0.0, "grad_norm": 1.6259267330169678, "kl": 0.058650944381952286, "learning_rate": 7.772222222222223e-07, "loss": 0.394, "num_tokens": 2566762.0, "reward": 4.625, "reward_std": 5.75, "rewards/reward_combined/mean": 4.625, "rewards/reward_combined/std": 5.75, "step": 8602 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.25, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 19.25, "completions/mean_terminated_length": 19.25, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 159.3148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.07593303918838501, "kl": 0.018075603526085615, "learning_rate": 7.766666666666667e-07, "loss": 0.0009, "num_tokens": 2567067.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8603 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.75, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 20.75, "completions/mean_terminated_length": 20.75, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 159.33333333333334, "frac_reward_zero_std": 1.0, "grad_norm": 0.03194275498390198, "kl": 0.01847834885120392, "learning_rate": 7.761111111111113e-07, "loss": 0.0009, "num_tokens": 2567398.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8604 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 159.35185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.042969297617673874, "kl": 0.0007576167699880898, "learning_rate": 7.755555555555556e-07, "loss": 0.0, "num_tokens": 2567654.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 8605 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 159.37037037037038, "frac_reward_zero_std": 1.0, "grad_norm": 5.688396777259186e-05, "kl": 5.252659320831299e-06, "learning_rate": 7.750000000000001e-07, "loss": 0.0, "num_tokens": 2567874.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 8606 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 29.25, "completions/clipped_ratio": 0.0, "completions/max_length": 37.0, "completions/max_terminated_length": 37.0, "completions/mean_length": 29.25, "completions/mean_terminated_length": 29.25, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 159.38888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.1351388394832611, "kl": 0.04726475663483143, "learning_rate": 7.744444444444445e-07, "loss": 0.0024, "num_tokens": 2568211.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8607 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.5, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 18.5, "completions/mean_terminated_length": 18.5, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 159.40740740740742, "frac_reward_zero_std": 1.0, "grad_norm": 0.6013656258583069, "kl": 0.22809038311243057, "learning_rate": 7.738888888888889e-07, "loss": 0.0113, "num_tokens": 2568505.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8608 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.25, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.25, "completions/mean_terminated_length": 11.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 159.42592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.1408982127904892, "kl": 0.06649911776185036, "learning_rate": 7.733333333333335e-07, "loss": 0.0033, "num_tokens": 2568778.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8609 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.25, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 22.25, "completions/mean_terminated_length": 22.25, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 159.44444444444446, "frac_reward_zero_std": 1.0, "grad_norm": 0.3107014298439026, "kl": 0.0890615563839674, "learning_rate": 7.727777777777778e-07, "loss": 0.0045, "num_tokens": 2569087.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 8610 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 39.25, "completions/clipped_ratio": 0.0, "completions/max_length": 41.0, "completions/max_terminated_length": 41.0, "completions/mean_length": 39.25, "completions/mean_terminated_length": 39.25, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 159.46296296296296, "frac_reward_zero_std": 0.0, "grad_norm": 2.1983306407928467, "kl": 0.1662871465086937, "learning_rate": 7.722222222222223e-07, "loss": 0.0168, "num_tokens": 2569496.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 8611 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 51.25, "completions/clipped_ratio": 0.0, "completions/max_length": 58.0, "completions/max_terminated_length": 58.0, "completions/mean_length": 51.25, "completions/mean_terminated_length": 51.25, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "epoch": 159.4814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.08004983514547348, "kl": 0.05631932243704796, "learning_rate": 7.716666666666667e-07, "loss": 0.0028, "num_tokens": 2569981.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 8612 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 159.5, "frac_reward_zero_std": 1.0, "grad_norm": 0.007479813415557146, "kl": 0.0013135522603988647, "learning_rate": 7.711111111111112e-07, "loss": 0.0001, "num_tokens": 2570193.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 8613 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.75, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.75, "completions/mean_terminated_length": 15.75, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 159.5185185185185, "frac_reward_zero_std": 0.0, "grad_norm": 2.716184139251709, "kl": 0.6931111142039299, "learning_rate": 7.705555555555557e-07, "loss": 0.0214, "num_tokens": 2570480.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 8614 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.25, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 8.25, "completions/mean_terminated_length": 8.25, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 159.53703703703704, "frac_reward_zero_std": 0.0, "grad_norm": 6.346282958984375, "kl": 0.0382201261818409, "learning_rate": 7.7e-07, "loss": 0.0283, "num_tokens": 2570725.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 8615 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 159.55555555555554, "frac_reward_zero_std": 1.0, "grad_norm": 0.009333653375506401, "kl": 0.23912183940410614, "learning_rate": 7.694444444444445e-07, "loss": 0.0119, "num_tokens": 2571025.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8616 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 159.57407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.008893478661775589, "kl": 0.0010676817619241774, "learning_rate": 7.688888888888891e-07, "loss": 0.0001, "num_tokens": 2571321.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8617 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 159.59259259259258, "frac_reward_zero_std": 1.0, "grad_norm": 0.010838386602699757, "kl": 0.02713806927204132, "learning_rate": 7.683333333333334e-07, "loss": 0.0014, "num_tokens": 2571537.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 8618 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.5, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 3.5, "completions/mean_terminated_length": 3.5, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 159.61111111111111, "frac_reward_zero_std": 0.0, "grad_norm": 23.0747013092041, "kl": 0.19696897268295288, "learning_rate": 7.677777777777779e-07, "loss": 0.2636, "num_tokens": 2571747.0, "reward": 2.75, "reward_std": 1.5, "rewards/reward_combined/mean": 2.75, "rewards/reward_combined/std": 1.5, "step": 8619 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 5.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 5.0, "completions/mean_terminated_length": 5.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 159.62962962962962, "frac_reward_zero_std": 1.0, "grad_norm": 0.3320097029209137, "kl": 0.03500998765230179, "learning_rate": 7.672222222222222e-07, "loss": 0.0022, "num_tokens": 2571987.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 8620 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.25, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.25, "completions/mean_terminated_length": 14.25, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 159.64814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.013897508382797241, "kl": 0.001402101363055408, "learning_rate": 7.666666666666667e-07, "loss": 0.0001, "num_tokens": 2572256.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8621 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 159.66666666666666, "frac_reward_zero_std": 1.0, "grad_norm": 0.027345256879925728, "kl": 0.4360329359769821, "learning_rate": 7.661111111111113e-07, "loss": 0.0218, "num_tokens": 2572540.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8622 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 159.6851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.0024301304947584867, "kl": 0.2824498862028122, "learning_rate": 7.655555555555556e-07, "loss": 0.0141, "num_tokens": 2572828.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8623 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 18.0, "completions/mean_terminated_length": 18.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 159.7037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.006667387671768665, "kl": 0.0007195902871899307, "learning_rate": 7.650000000000001e-07, "loss": 0.0, "num_tokens": 2573140.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8624 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 159.72222222222223, "frac_reward_zero_std": 0.0, "grad_norm": 7.311020374298096, "kl": 0.041833131457678974, "learning_rate": 7.644444444444444e-07, "loss": 0.1756, "num_tokens": 2573384.0, "reward": 3.875, "reward_std": 0.25, "rewards/reward_combined/mean": 3.875, "rewards/reward_combined/std": 0.25, "step": 8625 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.25, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 11.25, "completions/mean_terminated_length": 11.25, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 159.74074074074073, "frac_reward_zero_std": 1.0, "grad_norm": 0.2360103577375412, "kl": 0.02738371305167675, "learning_rate": 7.63888888888889e-07, "loss": 0.0016, "num_tokens": 2573649.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 8626 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 159.75925925925927, "frac_reward_zero_std": 1.0, "grad_norm": 0.3041801452636719, "kl": 0.03150467108935118, "learning_rate": 7.633333333333335e-07, "loss": 0.0016, "num_tokens": 2573951.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8627 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 159.77777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.013735211454331875, "kl": 0.0028435817221179605, "learning_rate": 7.627777777777778e-07, "loss": 0.0001, "num_tokens": 2574235.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8628 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.5, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 19.5, "completions/mean_terminated_length": 19.5, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 159.7962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.10941784083843231, "kl": 0.026277159340679646, "learning_rate": 7.622222222222223e-07, "loss": 0.0013, "num_tokens": 2574533.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8629 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 159.8148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.09795306622982025, "kl": 0.00995330885052681, "learning_rate": 7.616666666666666e-07, "loss": 0.0005, "num_tokens": 2574793.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 8630 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.75, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 15.75, "completions/mean_terminated_length": 15.75, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 159.83333333333334, "frac_reward_zero_std": 1.0, "grad_norm": 0.03326079621911049, "kl": 0.038264671340584755, "learning_rate": 7.611111111111112e-07, "loss": 0.0019, "num_tokens": 2575088.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8631 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 28.5, "completions/clipped_ratio": 0.0, "completions/max_length": 29.0, "completions/max_terminated_length": 29.0, "completions/mean_length": 28.5, "completions/mean_terminated_length": 28.5, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 159.85185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.09396833926439285, "kl": 0.1382615566253662, "learning_rate": 7.605555555555557e-07, "loss": 0.0069, "num_tokens": 2575418.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8632 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 159.87037037037038, "frac_reward_zero_std": 1.0, "grad_norm": 0.05532779544591904, "kl": 0.01400191243737936, "learning_rate": 7.6e-07, "loss": 0.0007, "num_tokens": 2575696.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8633 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 43.75, "completions/clipped_ratio": 0.0, "completions/max_length": 45.0, "completions/max_terminated_length": 45.0, "completions/mean_length": 43.75, "completions/mean_terminated_length": 43.75, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "epoch": 159.88888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.11454017460346222, "kl": 0.0832216739654541, "learning_rate": 7.594444444444445e-07, "loss": 0.0042, "num_tokens": 2576139.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8634 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 33.75, "completions/clipped_ratio": 0.0, "completions/max_length": 39.0, "completions/max_terminated_length": 39.0, "completions/mean_length": 33.75, "completions/mean_terminated_length": 33.75, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 159.90740740740742, "frac_reward_zero_std": 1.0, "grad_norm": 0.03576379641890526, "kl": 0.06682749837636948, "learning_rate": 7.588888888888891e-07, "loss": 0.0032, "num_tokens": 2576502.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8635 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 31.75, "completions/clipped_ratio": 0.0, "completions/max_length": 38.0, "completions/max_terminated_length": 38.0, "completions/mean_length": 31.75, "completions/mean_terminated_length": 31.75, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 159.92592592592592, "frac_reward_zero_std": 0.0, "grad_norm": 4.0003228187561035, "kl": 0.09298527613282204, "learning_rate": 7.583333333333334e-07, "loss": -0.0388, "num_tokens": 2576865.0, "reward": 7.625, "reward_std": 0.25, "rewards/reward_combined/mean": 7.625, "rewards/reward_combined/std": 0.25, "step": 8636 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.75, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.75, "completions/mean_terminated_length": 15.75, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 159.94444444444446, "frac_reward_zero_std": 1.0, "grad_norm": 0.011917726136744022, "kl": 0.0017241272144019604, "learning_rate": 7.577777777777779e-07, "loss": 0.0001, "num_tokens": 2577188.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8637 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 30.5, "completions/clipped_ratio": 0.0, "completions/max_length": 34.0, "completions/max_terminated_length": 34.0, "completions/mean_length": 30.5, "completions/mean_terminated_length": 30.5, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 159.96296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.1087518110871315, "kl": 0.11632949113845825, "learning_rate": 7.572222222222222e-07, "loss": 0.006, "num_tokens": 2577534.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8638 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.75, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 13.75, "completions/mean_terminated_length": 13.75, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 159.9814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.16862213611602783, "kl": 0.04723906982690096, "learning_rate": 7.566666666666667e-07, "loss": 0.0027, "num_tokens": 2577813.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8639 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.25, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 14.25, "completions/mean_terminated_length": 14.25, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 160.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.29489049315452576, "kl": 0.02602849411778152, "learning_rate": 7.561111111111113e-07, "loss": 0.0017, "num_tokens": 2578090.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8640 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 160.0185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.02180519886314869, "kl": 0.011120177805423737, "learning_rate": 7.555555555555556e-07, "loss": 0.0006, "num_tokens": 2578402.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 8641 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 40.0, "completions/clipped_ratio": 0.0, "completions/max_length": 46.0, "completions/max_terminated_length": 46.0, "completions/mean_length": 40.0, "completions/mean_terminated_length": 40.0, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "epoch": 160.03703703703704, "frac_reward_zero_std": 0.0, "grad_norm": 2.141204595565796, "kl": 0.11307127773761749, "learning_rate": 7.550000000000001e-07, "loss": 0.0664, "num_tokens": 2578790.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 8642 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 160.05555555555554, "frac_reward_zero_std": 1.0, "grad_norm": 0.060298409312963486, "kl": 0.0035877451300621033, "learning_rate": 7.544444444444444e-07, "loss": 0.0002, "num_tokens": 2579002.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 8643 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 160.07407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.04624953493475914, "kl": 0.031415000557899475, "learning_rate": 7.53888888888889e-07, "loss": 0.0016, "num_tokens": 2579292.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8644 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 160.09259259259258, "frac_reward_zero_std": 1.0, "grad_norm": 0.007756698876619339, "kl": 0.0012585307704284787, "learning_rate": 7.533333333333335e-07, "loss": 0.0001, "num_tokens": 2579580.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8645 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.25, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 9.25, "completions/mean_terminated_length": 9.25, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 160.11111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.04215005785226822, "kl": 0.004255413077771664, "learning_rate": 7.527777777777778e-07, "loss": 0.0002, "num_tokens": 2579829.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 8646 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 160.12962962962962, "frac_reward_zero_std": 1.0, "grad_norm": 0.10108750313520432, "kl": 0.010190781205892563, "learning_rate": 7.522222222222223e-07, "loss": 0.0005, "num_tokens": 2580089.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 8647 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 160.14814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.06795713305473328, "kl": 0.0019681528210639954, "learning_rate": 7.516666666666668e-07, "loss": 0.0001, "num_tokens": 2580301.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 8648 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 7.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 7.0, "completions/mean_terminated_length": 7.0, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 160.16666666666666, "frac_reward_zero_std": 1.0, "grad_norm": 0.01602315902709961, "kl": 0.0011250759707763791, "learning_rate": 7.511111111111112e-07, "loss": 0.0001, "num_tokens": 2580537.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 8649 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 17.0, "completions/mean_terminated_length": 17.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 160.1851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.0072424402460455894, "kl": 0.17491436749696732, "learning_rate": 7.505555555555557e-07, "loss": 0.0087, "num_tokens": 2580845.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8650 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 160.2037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.02768847905099392, "kl": 0.011512557975947857, "learning_rate": 7.5e-07, "loss": 0.0006, "num_tokens": 2581117.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8651 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 160.22222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.0024875516537576914, "kl": 0.2824254631996155, "learning_rate": 7.494444444444445e-07, "loss": 0.0141, "num_tokens": 2581405.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8652 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 160.24074074074073, "frac_reward_zero_std": 1.0, "grad_norm": 0.03699566796422005, "kl": 0.008219875395298004, "learning_rate": 7.48888888888889e-07, "loss": 0.0004, "num_tokens": 2581689.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8653 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.5, "completions/clipped_ratio": 0.0, "completions/max_length": 39.0, "completions/max_terminated_length": 39.0, "completions/mean_length": 32.5, "completions/mean_terminated_length": 32.5, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 160.25925925925927, "frac_reward_zero_std": 0.0, "grad_norm": 2.1106181144714355, "kl": 0.24624864012002945, "learning_rate": 7.483333333333334e-07, "loss": 0.0106, "num_tokens": 2582055.0, "reward": 5.5, "reward_std": 2.309401035308838, "rewards/reward_combined/mean": 5.5, "rewards/reward_combined/std": 2.309401035308838, "step": 8654 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 160.27777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 3.3288353733951226e-05, "kl": 4.395842552185059e-06, "learning_rate": 7.477777777777779e-07, "loss": 0.0, "num_tokens": 2582275.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 8655 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.25, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 20.25, "completions/mean_terminated_length": 20.25, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 160.2962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.05300474911928177, "kl": 0.023547668009996414, "learning_rate": 7.472222222222222e-07, "loss": 0.0012, "num_tokens": 2582576.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8656 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 160.3148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.05214867368340492, "kl": 0.0025189935258822516, "learning_rate": 7.466666666666668e-07, "loss": 0.0001, "num_tokens": 2582832.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 8657 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.0, "completions/clipped_ratio": 0.0, "completions/max_length": 28.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 23.0, "completions/mean_terminated_length": 23.0, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 160.33333333333334, "frac_reward_zero_std": 0.0, "grad_norm": 7.273121356964111, "kl": 0.813946396112442, "learning_rate": 7.461111111111112e-07, "loss": 0.1407, "num_tokens": 2583176.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 8658 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 160.35185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.012300224043428898, "kl": 0.008113138377666473, "learning_rate": 7.455555555555556e-07, "loss": 0.0004, "num_tokens": 2583412.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 8659 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.5, "completions/clipped_ratio": 0.0, "completions/max_length": 25.0, "completions/max_terminated_length": 25.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 160.37037037037038, "frac_reward_zero_std": 0.0, "grad_norm": 2.749103307723999, "kl": 0.04958202503621578, "learning_rate": 7.450000000000001e-07, "loss": 0.1938, "num_tokens": 2583698.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 8660 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.75, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 13.75, "completions/mean_terminated_length": 13.75, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 160.38888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.016568997874855995, "kl": 0.002559225831646472, "learning_rate": 7.444444444444444e-07, "loss": 0.0001, "num_tokens": 2583977.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8661 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 31.0, "completions/clipped_ratio": 0.0, "completions/max_length": 35.0, "completions/max_terminated_length": 35.0, "completions/mean_length": 31.0, "completions/mean_terminated_length": 31.0, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 160.40740740740742, "frac_reward_zero_std": 0.0, "grad_norm": 1.9140416383743286, "kl": 0.13209281861782074, "learning_rate": 7.43888888888889e-07, "loss": 0.1502, "num_tokens": 2584353.0, "reward": 6.125, "reward_std": 3.4247870445251465, "rewards/reward_combined/mean": 6.125, "rewards/reward_combined/std": 3.4247870445251465, "step": 8662 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.75, "completions/clipped_ratio": 0.0, "completions/max_length": 28.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 22.75, "completions/mean_terminated_length": 22.75, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 160.42592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.06887120753526688, "kl": 0.015933023765683174, "learning_rate": 7.433333333333335e-07, "loss": 0.0008, "num_tokens": 2584692.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 8663 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.5, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 160.44444444444446, "frac_reward_zero_std": 1.0, "grad_norm": 0.14145739376544952, "kl": 0.024395578540861607, "learning_rate": 7.427777777777778e-07, "loss": 0.0011, "num_tokens": 2584970.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8664 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 160.46296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.007509865798056126, "kl": 0.004106778302229941, "learning_rate": 7.422222222222223e-07, "loss": 0.0002, "num_tokens": 2585234.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8665 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.0, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 21.0, "completions/mean_terminated_length": 21.0, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 160.4814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.04255496338009834, "kl": 0.010828753001987934, "learning_rate": 7.416666666666668e-07, "loss": 0.0005, "num_tokens": 2585562.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8666 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.75, "completions/clipped_ratio": 0.0, "completions/max_length": 31.0, "completions/max_terminated_length": 31.0, "completions/mean_length": 19.75, "completions/mean_terminated_length": 19.75, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 160.5, "frac_reward_zero_std": 1.0, "grad_norm": 0.8952662348747253, "kl": 0.24613042175769806, "learning_rate": 7.411111111111112e-07, "loss": 0.0136, "num_tokens": 2585861.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8667 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 18.0, "completions/mean_terminated_length": 18.0, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 160.5185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.028812618926167488, "kl": 0.01032928703352809, "learning_rate": 7.405555555555557e-07, "loss": 0.0005, "num_tokens": 2586161.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8668 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.25, "completions/clipped_ratio": 0.0, "completions/max_length": 29.0, "completions/max_terminated_length": 29.0, "completions/mean_length": 23.25, "completions/mean_terminated_length": 23.25, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 160.53703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.278554767370224, "kl": 0.06647798791527748, "learning_rate": 7.4e-07, "loss": 0.0036, "num_tokens": 2586474.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 8669 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 10.5, "completions/mean_terminated_length": 10.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 160.55555555555554, "frac_reward_zero_std": 1.0, "grad_norm": 0.024577636271715164, "kl": 0.0019646987202577293, "learning_rate": 7.394444444444445e-07, "loss": 0.0001, "num_tokens": 2586736.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8670 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 160.57407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.030112557113170624, "kl": 0.003950923914089799, "learning_rate": 7.38888888888889e-07, "loss": 0.0002, "num_tokens": 2587034.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8671 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 35.5, "completions/clipped_ratio": 0.0, "completions/max_length": 51.0, "completions/max_terminated_length": 51.0, "completions/mean_length": 35.5, "completions/mean_terminated_length": 35.5, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 160.59259259259258, "frac_reward_zero_std": 1.0, "grad_norm": 0.05718117207288742, "kl": 0.028510892763733864, "learning_rate": 7.383333333333334e-07, "loss": 0.0012, "num_tokens": 2587396.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8672 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 160.61111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.02495865523815155, "kl": 0.4355323165655136, "learning_rate": 7.377777777777779e-07, "loss": 0.0218, "num_tokens": 2587680.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8673 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 5.25, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 5.25, "completions/mean_terminated_length": 5.25, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 160.62962962962962, "frac_reward_zero_std": 1.0, "grad_norm": 0.23175004124641418, "kl": 0.03839058429002762, "learning_rate": 7.372222222222222e-07, "loss": 0.0021, "num_tokens": 2587901.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 8674 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 55.25, "completions/clipped_ratio": 0.0, "completions/max_length": 117.0, "completions/max_terminated_length": 117.0, "completions/mean_length": 55.25, "completions/mean_terminated_length": 55.25, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 160.64814814814815, "frac_reward_zero_std": 0.0, "grad_norm": 2.184823751449585, "kl": 0.07414969801902771, "learning_rate": 7.366666666666668e-07, "loss": 0.3307, "num_tokens": 2588346.0, "reward": 4.5, "reward_std": 3.674234628677368, "rewards/reward_combined/mean": 4.5, "rewards/reward_combined/std": 3.674234628677368, "step": 8675 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 160.66666666666666, "frac_reward_zero_std": 1.0, "grad_norm": 0.0030755209736526012, "kl": 0.00018930435908259824, "learning_rate": 7.361111111111112e-07, "loss": 0.0, "num_tokens": 2588606.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 8676 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 160.6851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.41704970598220825, "kl": 0.04929552669636905, "learning_rate": 7.355555555555556e-07, "loss": 0.0027, "num_tokens": 2588904.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8677 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 160.7037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.015087327919900417, "kl": 0.003162097418680787, "learning_rate": 7.350000000000001e-07, "loss": 0.0002, "num_tokens": 2589188.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8678 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.25, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 17.25, "completions/mean_terminated_length": 17.25, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 160.72222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.020046573132276535, "kl": 0.002187970996601507, "learning_rate": 7.344444444444445e-07, "loss": 0.0001, "num_tokens": 2589497.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8679 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 160.74074074074073, "frac_reward_zero_std": 1.0, "grad_norm": 0.007581006735563278, "kl": 0.001300722360610962, "learning_rate": 7.33888888888889e-07, "loss": 0.0001, "num_tokens": 2589709.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 8680 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 29.25, "completions/clipped_ratio": 0.0, "completions/max_length": 31.0, "completions/max_terminated_length": 31.0, "completions/mean_length": 29.25, "completions/mean_terminated_length": 29.25, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 160.75925925925927, "frac_reward_zero_std": 1.0, "grad_norm": 0.07516289502382278, "kl": 0.11862561851739883, "learning_rate": 7.333333333333334e-07, "loss": 0.006, "num_tokens": 2590050.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8681 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.5, "completions/clipped_ratio": 0.0, "completions/max_length": 30.0, "completions/max_terminated_length": 30.0, "completions/mean_length": 24.5, "completions/mean_terminated_length": 24.5, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 160.77777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.06430783867835999, "kl": 0.016510052140802145, "learning_rate": 7.327777777777778e-07, "loss": 0.0009, "num_tokens": 2590376.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8682 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.5, "completions/clipped_ratio": 0.0, "completions/max_length": 58.0, "completions/max_terminated_length": 58.0, "completions/mean_length": 23.5, "completions/mean_terminated_length": 23.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 160.7962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.18011054396629333, "kl": 0.032970087602734566, "learning_rate": 7.322222222222223e-07, "loss": 0.0027, "num_tokens": 2590690.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8683 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 40.0, "completions/clipped_ratio": 0.0, "completions/max_length": 43.0, "completions/max_terminated_length": 43.0, "completions/mean_length": 40.0, "completions/mean_terminated_length": 40.0, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 160.8148148148148, "frac_reward_zero_std": 0.0, "grad_norm": 0.8287065625190735, "kl": 0.10868450999259949, "learning_rate": 7.316666666666668e-07, "loss": 0.005, "num_tokens": 2591130.0, "reward": 1.125, "reward_std": 1.25, "rewards/reward_combined/mean": 1.125, "rewards/reward_combined/std": 1.25, "step": 8684 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 160.83333333333334, "frac_reward_zero_std": 1.0, "grad_norm": 0.01950831338763237, "kl": 0.011124608106911182, "learning_rate": 7.311111111111112e-07, "loss": 0.0006, "num_tokens": 2591418.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8685 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 28.5, "completions/clipped_ratio": 0.0, "completions/max_length": 29.0, "completions/max_terminated_length": 29.0, "completions/mean_length": 28.5, "completions/mean_terminated_length": 28.5, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 160.85185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.07862893491983414, "kl": 0.1369815692305565, "learning_rate": 7.305555555555556e-07, "loss": 0.0068, "num_tokens": 2591748.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8686 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 160.87037037037038, "frac_reward_zero_std": 1.0, "grad_norm": 0.0068129245191812515, "kl": 0.22633734345436096, "learning_rate": 7.3e-07, "loss": 0.0113, "num_tokens": 2592050.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8687 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 40.5, "completions/clipped_ratio": 0.0, "completions/max_length": 148.0, "completions/max_terminated_length": 148.0, "completions/mean_length": 40.5, "completions/mean_terminated_length": 40.5, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 160.88888888888889, "frac_reward_zero_std": 0.0, "grad_norm": 1.966058611869812, "kl": 0.00515765548334457, "learning_rate": 7.294444444444446e-07, "loss": 0.4756, "num_tokens": 2592412.0, "reward": 3.625, "reward_std": 0.75, "rewards/reward_combined/mean": 3.625, "rewards/reward_combined/std": 0.75, "step": 8688 }, { "clip_ratio/high_max": 0.009999999776482582, "clip_ratio/high_mean": 0.009999999776482582, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.009999999776482582, "completion_length": 21.75, "completions/clipped_ratio": 0.0, "completions/max_length": 31.0, "completions/max_terminated_length": 31.0, "completions/mean_length": 21.75, "completions/mean_terminated_length": 21.75, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 160.90740740740742, "frac_reward_zero_std": 0.0, "grad_norm": 2.75573468208313, "kl": 0.09794112294912338, "learning_rate": 7.28888888888889e-07, "loss": -0.0154, "num_tokens": 2592727.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 8689 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.25, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 15.25, "completions/mean_terminated_length": 15.25, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 160.92592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.017765862867236137, "kl": 0.0019282300490885973, "learning_rate": 7.283333333333334e-07, "loss": 0.0001, "num_tokens": 2593048.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8690 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.5, "completions/clipped_ratio": 0.0, "completions/max_length": 25.0, "completions/max_terminated_length": 25.0, "completions/mean_length": 20.5, "completions/mean_terminated_length": 20.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 160.94444444444446, "frac_reward_zero_std": 1.0, "grad_norm": 0.029876690357923508, "kl": 0.01777749741449952, "learning_rate": 7.277777777777778e-07, "loss": 0.0009, "num_tokens": 2593378.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8691 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 160.96296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.015538151375949383, "kl": 0.001153915945906192, "learning_rate": 7.272222222222222e-07, "loss": 0.0001, "num_tokens": 2593646.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8692 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 160.9814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.0193488746881485, "kl": 0.01428111083805561, "learning_rate": 7.266666666666668e-07, "loss": 0.0007, "num_tokens": 2593906.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8693 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 43.75, "completions/clipped_ratio": 0.0, "completions/max_length": 48.0, "completions/max_terminated_length": 48.0, "completions/mean_length": 43.75, "completions/mean_terminated_length": 43.75, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 161.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.2588146924972534, "kl": 0.09557468444108963, "learning_rate": 7.261111111111112e-07, "loss": 0.0048, "num_tokens": 2594349.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8694 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 161.0185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.0009763312409631908, "kl": 0.00015191734200925566, "learning_rate": 7.255555555555556e-07, "loss": 0.0, "num_tokens": 2594605.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 8695 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.25, "completions/clipped_ratio": 0.0, "completions/max_length": 27.0, "completions/max_terminated_length": 27.0, "completions/mean_length": 14.25, "completions/mean_terminated_length": 14.25, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 161.03703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.07149023562669754, "kl": 0.016043067211285233, "learning_rate": 7.25e-07, "loss": 0.0011, "num_tokens": 2594882.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 8696 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 161.05555555555554, "frac_reward_zero_std": 1.0, "grad_norm": 0.021605586633086205, "kl": 0.011281829327344894, "learning_rate": 7.244444444444446e-07, "loss": 0.0006, "num_tokens": 2595194.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 8697 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 161.07407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.010244259610772133, "kl": 0.0005275532603263855, "learning_rate": 7.23888888888889e-07, "loss": 0.0, "num_tokens": 2595406.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 8698 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.25, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 14.25, "completions/mean_terminated_length": 14.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 161.09259259259258, "frac_reward_zero_std": 0.0, "grad_norm": 3.4780900478363037, "kl": 0.03680648002773523, "learning_rate": 7.233333333333334e-07, "loss": 0.1193, "num_tokens": 2595687.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 8699 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.0, "completions/clipped_ratio": 0.0, "completions/max_length": 34.0, "completions/max_terminated_length": 34.0, "completions/mean_length": 20.0, "completions/mean_terminated_length": 20.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 161.11111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.22935037314891815, "kl": 0.04068848676979542, "learning_rate": 7.227777777777778e-07, "loss": 0.0022, "num_tokens": 2595987.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8700 }, { "clip_ratio/high_max": 0.01315789483487606, "clip_ratio/high_mean": 0.01315789483487606, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.01315789483487606, "completion_length": 26.75, "completions/clipped_ratio": 0.0, "completions/max_length": 35.0, "completions/max_terminated_length": 35.0, "completions/mean_length": 26.75, "completions/mean_terminated_length": 26.75, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 161.12962962962962, "frac_reward_zero_std": 0.0, "grad_norm": 1.940875768661499, "kl": 0.7140426188707352, "learning_rate": 7.222222222222222e-07, "loss": 0.0849, "num_tokens": 2596346.0, "reward": 7.0, "reward_std": 1.6832507848739624, "rewards/reward_combined/mean": 7.0, "rewards/reward_combined/std": 1.683250904083252, "step": 8701 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.5, "completions/clipped_ratio": 0.0, "completions/max_length": 25.0, "completions/max_terminated_length": 25.0, "completions/mean_length": 21.5, "completions/mean_terminated_length": 21.5, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 161.14814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.03578910231590271, "kl": 0.20091351587325335, "learning_rate": 7.216666666666668e-07, "loss": 0.0086, "num_tokens": 2596680.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8702 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 161.16666666666666, "frac_reward_zero_std": 1.0, "grad_norm": 0.015147630125284195, "kl": 0.003930769395083189, "learning_rate": 7.211111111111112e-07, "loss": 0.0002, "num_tokens": 2596984.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8703 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 39.5, "completions/clipped_ratio": 0.0, "completions/max_length": 45.0, "completions/max_terminated_length": 45.0, "completions/mean_length": 39.5, "completions/mean_terminated_length": 39.5, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 161.1851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.032209157943725586, "kl": 0.08230356127023697, "learning_rate": 7.205555555555556e-07, "loss": 0.0041, "num_tokens": 2597410.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8704 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 35.75, "completions/clipped_ratio": 0.0, "completions/max_length": 103.0, "completions/max_terminated_length": 103.0, "completions/mean_length": 35.75, "completions/mean_terminated_length": 35.75, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 161.2037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.2515179216861725, "kl": 0.03782888932619244, "learning_rate": 7.2e-07, "loss": 0.0033, "num_tokens": 2597765.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8705 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 161.22222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.014805740676820278, "kl": 0.002440673066303134, "learning_rate": 7.194444444444446e-07, "loss": 0.0001, "num_tokens": 2598047.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8706 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 161.24074074074073, "frac_reward_zero_std": 1.0, "grad_norm": 0.02762470580637455, "kl": 0.003623608499765396, "learning_rate": 7.18888888888889e-07, "loss": 0.0002, "num_tokens": 2598307.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 8707 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.25, "completions/clipped_ratio": 0.0, "completions/max_length": 28.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 22.25, "completions/mean_terminated_length": 22.25, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 161.25925925925927, "frac_reward_zero_std": 1.0, "grad_norm": 0.4920818507671356, "kl": 0.07633948465809226, "learning_rate": 7.183333333333334e-07, "loss": 0.0041, "num_tokens": 2598644.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 8708 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 161.27777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.039325300604104996, "kl": 0.011033754330128431, "learning_rate": 7.177777777777778e-07, "loss": 0.0006, "num_tokens": 2598912.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8709 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 161.2962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.07769180089235306, "kl": 0.0051200445159338415, "learning_rate": 7.172222222222223e-07, "loss": 0.0003, "num_tokens": 2599208.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8710 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 161.3148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.038313981145620346, "kl": 0.02912721037864685, "learning_rate": 7.166666666666668e-07, "loss": 0.0015, "num_tokens": 2599500.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8711 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.25, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.25, "completions/mean_terminated_length": 11.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 161.33333333333334, "frac_reward_zero_std": 1.0, "grad_norm": 0.026357024908065796, "kl": 0.007383981253951788, "learning_rate": 7.161111111111112e-07, "loss": 0.0004, "num_tokens": 2599773.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8712 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.25, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 22.25, "completions/mean_terminated_length": 22.25, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 161.35185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.06782086193561554, "kl": 0.013055219314992428, "learning_rate": 7.155555555555556e-07, "loss": 0.0006, "num_tokens": 2600106.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8713 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 42.75, "completions/clipped_ratio": 0.0, "completions/max_length": 53.0, "completions/max_terminated_length": 53.0, "completions/mean_length": 42.75, "completions/mean_terminated_length": 42.75, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 161.37037037037038, "frac_reward_zero_std": 1.0, "grad_norm": 0.1300487518310547, "kl": 0.07946392707526684, "learning_rate": 7.15e-07, "loss": 0.0038, "num_tokens": 2600557.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 8714 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.0, "completions/clipped_ratio": 0.0, "completions/max_length": 32.0, "completions/max_terminated_length": 32.0, "completions/mean_length": 24.0, "completions/mean_terminated_length": 24.0, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 161.38888888888889, "frac_reward_zero_std": 0.0, "grad_norm": 10.57738971710205, "kl": 0.03831977769732475, "learning_rate": 7.144444444444446e-07, "loss": -0.0196, "num_tokens": 2600905.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 8715 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 161.40740740740742, "frac_reward_zero_std": 1.0, "grad_norm": 0.0024163550697267056, "kl": 0.2824115455150604, "learning_rate": 7.13888888888889e-07, "loss": 0.0141, "num_tokens": 2601193.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8716 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 28.5, "completions/clipped_ratio": 0.0, "completions/max_length": 30.0, "completions/max_terminated_length": 30.0, "completions/mean_length": 28.5, "completions/mean_terminated_length": 28.5, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 161.42592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.06022659316658974, "kl": 0.13408581912517548, "learning_rate": 7.133333333333334e-07, "loss": 0.0067, "num_tokens": 2601523.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8717 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.25, "completions/clipped_ratio": 0.0, "completions/max_length": 25.0, "completions/max_terminated_length": 25.0, "completions/mean_length": 20.25, "completions/mean_terminated_length": 20.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 161.44444444444446, "frac_reward_zero_std": 0.0, "grad_norm": 3.7046921253204346, "kl": 0.04491441510617733, "learning_rate": 7.127777777777778e-07, "loss": 0.0417, "num_tokens": 2601824.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 8718 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 34.25, "completions/clipped_ratio": 0.0, "completions/max_length": 35.0, "completions/max_terminated_length": 35.0, "completions/mean_length": 34.25, "completions/mean_terminated_length": 34.25, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 161.46296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.021703442558646202, "kl": 0.06901183724403381, "learning_rate": 7.122222222222223e-07, "loss": 0.0035, "num_tokens": 2602189.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8719 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.75, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.75, "completions/mean_terminated_length": 6.75, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 161.4814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.02191501297056675, "kl": 0.0013033017166890204, "learning_rate": 7.116666666666668e-07, "loss": 0.0001, "num_tokens": 2602424.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 8720 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 31.0, "completions/clipped_ratio": 0.0, "completions/max_length": 38.0, "completions/max_terminated_length": 38.0, "completions/mean_length": 31.0, "completions/mean_terminated_length": 31.0, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 161.5, "frac_reward_zero_std": 0.0, "grad_norm": 2.6838600635528564, "kl": 0.12514445558190346, "learning_rate": 7.111111111111112e-07, "loss": 0.1194, "num_tokens": 2602784.0, "reward": 5.25, "reward_std": 4.5, "rewards/reward_combined/mean": 5.25, "rewards/reward_combined/std": 4.5, "step": 8721 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 161.5185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.05236811935901642, "kl": 0.17498696595430374, "learning_rate": 7.105555555555556e-07, "loss": 0.0087, "num_tokens": 2603068.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8722 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 7.75, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 7.75, "completions/mean_terminated_length": 7.75, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 161.53703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.0676974505186081, "kl": 0.004080387530848384, "learning_rate": 7.1e-07, "loss": 0.0002, "num_tokens": 2603311.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 8723 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 5.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 5.0, "completions/mean_terminated_length": 5.0, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 161.55555555555554, "frac_reward_zero_std": 1.0, "grad_norm": 0.004531720653176308, "kl": 0.0006735920906066895, "learning_rate": 7.094444444444445e-07, "loss": 0.0, "num_tokens": 2603531.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 8724 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.25, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 20.25, "completions/mean_terminated_length": 20.25, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 161.57407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.14854742586612701, "kl": 0.09764118865132332, "learning_rate": 7.08888888888889e-07, "loss": 0.0049, "num_tokens": 2603840.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8725 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 161.59259259259258, "frac_reward_zero_std": 1.0, "grad_norm": 0.02885187603533268, "kl": 0.4357842206954956, "learning_rate": 7.083333333333334e-07, "loss": 0.0218, "num_tokens": 2604124.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8726 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 31.5, "completions/clipped_ratio": 0.0, "completions/max_length": 34.0, "completions/max_terminated_length": 34.0, "completions/mean_length": 31.5, "completions/mean_terminated_length": 31.5, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 161.61111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.4002673923969269, "kl": 0.16597672551870346, "learning_rate": 7.077777777777778e-07, "loss": 0.0085, "num_tokens": 2604474.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8727 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.75, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 16.75, "completions/mean_terminated_length": 16.75, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 161.62962962962962, "frac_reward_zero_std": 1.0, "grad_norm": 0.03358500450849533, "kl": 0.0035024603130295873, "learning_rate": 7.072222222222223e-07, "loss": 0.0002, "num_tokens": 2604801.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8728 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 161.64814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.032677531242370605, "kl": 0.04116937704384327, "learning_rate": 7.066666666666667e-07, "loss": 0.0021, "num_tokens": 2605089.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8729 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 161.66666666666666, "frac_reward_zero_std": 1.0, "grad_norm": 0.0116764260455966, "kl": 0.008203685283660889, "learning_rate": 7.061111111111112e-07, "loss": 0.0004, "num_tokens": 2605325.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 8730 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 161.6851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.015408665873110294, "kl": 0.0026824738597497344, "learning_rate": 7.055555555555556e-07, "loss": 0.0001, "num_tokens": 2605595.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8731 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 161.7037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.00754248071461916, "kl": 0.0012848526239395142, "learning_rate": 7.05e-07, "loss": 0.0001, "num_tokens": 2605807.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 8732 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.5, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 3.5, "completions/mean_terminated_length": 3.5, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 161.72222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.23572535812854767, "kl": 0.016731038689613342, "learning_rate": 7.044444444444446e-07, "loss": 0.0012, "num_tokens": 2606017.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 8733 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.25, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.25, "completions/mean_terminated_length": 13.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 161.74074074074073, "frac_reward_zero_std": 1.0, "grad_norm": 0.048969682306051254, "kl": 0.010787187609821558, "learning_rate": 7.03888888888889e-07, "loss": 0.0005, "num_tokens": 2606286.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8734 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 9.5, "completions/mean_terminated_length": 9.5, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 161.75925925925927, "frac_reward_zero_std": 1.0, "grad_norm": 0.04629872366786003, "kl": 0.00366671709343791, "learning_rate": 7.033333333333334e-07, "loss": 0.0002, "num_tokens": 2606544.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8735 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.5, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 18.5, "completions/mean_terminated_length": 18.5, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 161.77777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.033702582120895386, "kl": 0.008034982020035386, "learning_rate": 7.027777777777778e-07, "loss": 0.0004, "num_tokens": 2606858.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8736 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 12.5, "completions/mean_terminated_length": 12.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 161.7962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.012332775630056858, "kl": 0.002668790169991553, "learning_rate": 7.022222222222223e-07, "loss": 0.0001, "num_tokens": 2607140.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8737 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 18.0, "completions/mean_terminated_length": 18.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 161.8148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.07664754241704941, "kl": 0.1814245879650116, "learning_rate": 7.016666666666668e-07, "loss": 0.0091, "num_tokens": 2607452.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8738 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.75, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 4.75, "completions/mean_terminated_length": 4.75, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 161.83333333333334, "frac_reward_zero_std": 1.0, "grad_norm": 0.07587151229381561, "kl": 0.02907184697687626, "learning_rate": 7.011111111111112e-07, "loss": 0.0015, "num_tokens": 2607671.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 8739 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.75, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 17.75, "completions/mean_terminated_length": 17.75, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 161.85185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.03433530777692795, "kl": 0.010925936046987772, "learning_rate": 7.005555555555556e-07, "loss": 0.0005, "num_tokens": 2607970.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8740 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 161.87037037037038, "frac_reward_zero_std": 1.0, "grad_norm": 0.025093959644436836, "kl": 0.0003535225987434387, "learning_rate": 7.000000000000001e-07, "loss": 0.0, "num_tokens": 2608190.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 8741 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 161.88888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.00881755817681551, "kl": 0.23913726210594177, "learning_rate": 6.994444444444445e-07, "loss": 0.0119, "num_tokens": 2608490.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8742 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 35.5, "completions/clipped_ratio": 0.0, "completions/max_length": 51.0, "completions/max_terminated_length": 51.0, "completions/mean_length": 35.5, "completions/mean_terminated_length": 35.5, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 161.90740740740742, "frac_reward_zero_std": 1.0, "grad_norm": 0.4133559763431549, "kl": 0.11200284026563168, "learning_rate": 6.98888888888889e-07, "loss": 0.0055, "num_tokens": 2608856.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8743 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.25, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 19.25, "completions/mean_terminated_length": 19.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 161.92592592592592, "frac_reward_zero_std": 0.0, "grad_norm": 7.780724048614502, "kl": 0.06893253326416016, "learning_rate": 6.983333333333334e-07, "loss": 0.0258, "num_tokens": 2609153.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 8744 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.0, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 22.0, "completions/mean_terminated_length": 22.0, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 161.94444444444446, "frac_reward_zero_std": 1.0, "grad_norm": 0.03508951514959335, "kl": 0.008666804060339928, "learning_rate": 6.977777777777778e-07, "loss": 0.0004, "num_tokens": 2609469.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8745 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 44.75, "completions/clipped_ratio": 0.0, "completions/max_length": 53.0, "completions/max_terminated_length": 53.0, "completions/mean_length": 44.75, "completions/mean_terminated_length": 44.75, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 161.96296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.020120110362768173, "kl": 0.026305802166461945, "learning_rate": 6.972222222222223e-07, "loss": 0.0013, "num_tokens": 2609868.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8746 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 161.9814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.053215205669403076, "kl": 0.007764179026708007, "learning_rate": 6.966666666666667e-07, "loss": 0.0004, "num_tokens": 2610158.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8747 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 162.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.01873633824288845, "kl": 0.014374779537320137, "learning_rate": 6.961111111111112e-07, "loss": 0.0007, "num_tokens": 2610418.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8748 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.5, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 17.5, "completions/mean_terminated_length": 17.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 162.0185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.01776733808219433, "kl": 0.17060206085443497, "learning_rate": 6.955555555555556e-07, "loss": 0.0085, "num_tokens": 2610728.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8749 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 162.03703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.01645686849951744, "kl": 0.002744188765063882, "learning_rate": 6.950000000000001e-07, "loss": 0.0001, "num_tokens": 2611012.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8750 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 162.05555555555554, "frac_reward_zero_std": 1.0, "grad_norm": 0.0024030960630625486, "kl": 0.28240492939949036, "learning_rate": 6.944444444444446e-07, "loss": 0.0141, "num_tokens": 2611300.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8751 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 162.07407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.03304360434412956, "kl": 0.0007602125551784411, "learning_rate": 6.938888888888889e-07, "loss": 0.0, "num_tokens": 2611556.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 8752 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 162.09259259259258, "frac_reward_zero_std": 1.0, "grad_norm": 0.011818482540547848, "kl": 0.026815377175807953, "learning_rate": 6.933333333333334e-07, "loss": 0.0013, "num_tokens": 2611772.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 8753 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.25, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 16.25, "completions/mean_terminated_length": 16.25, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 162.11111111111111, "frac_reward_zero_std": 0.0, "grad_norm": 2.803168773651123, "kl": 0.5922785326838493, "learning_rate": 6.927777777777778e-07, "loss": 0.0269, "num_tokens": 2612057.0, "reward": 4.5, "reward_std": 4.041451930999756, "rewards/reward_combined/mean": 4.5, "rewards/reward_combined/std": 4.041451930999756, "step": 8754 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 162.12962962962962, "frac_reward_zero_std": 1.0, "grad_norm": 0.007610258646309376, "kl": 0.0012304633855819702, "learning_rate": 6.922222222222223e-07, "loss": 0.0001, "num_tokens": 2612269.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 8755 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.5, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.5, "completions/mean_terminated_length": 6.5, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 162.14814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.03283996507525444, "kl": 0.0014274517307057977, "learning_rate": 6.916666666666668e-07, "loss": 0.0001, "num_tokens": 2612503.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 8756 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 162.16666666666666, "frac_reward_zero_std": 1.0, "grad_norm": 0.008872909471392632, "kl": 0.0011906840954907238, "learning_rate": 6.911111111111111e-07, "loss": 0.0001, "num_tokens": 2612799.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8757 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 162.1851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.03852180391550064, "kl": 0.029351248871535063, "learning_rate": 6.905555555555556e-07, "loss": 0.0016, "num_tokens": 2613071.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8758 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.5, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 22.5, "completions/mean_terminated_length": 22.5, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 162.2037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.019160296767950058, "kl": 0.007208510767668486, "learning_rate": 6.900000000000001e-07, "loss": 0.0004, "num_tokens": 2613389.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8759 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 162.22222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.0802341178059578, "kl": 0.013905809726566076, "learning_rate": 6.894444444444445e-07, "loss": 0.0007, "num_tokens": 2613669.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8760 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.25, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 9.25, "completions/mean_terminated_length": 9.25, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 162.24074074074073, "frac_reward_zero_std": 1.0, "grad_norm": 0.042930759489536285, "kl": 0.003662863513454795, "learning_rate": 6.88888888888889e-07, "loss": 0.0002, "num_tokens": 2613918.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 8761 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 38.75, "completions/clipped_ratio": 0.0, "completions/max_length": 45.0, "completions/max_terminated_length": 45.0, "completions/mean_length": 38.75, "completions/mean_terminated_length": 38.75, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 162.25925925925927, "frac_reward_zero_std": 1.0, "grad_norm": 0.06930079311132431, "kl": 0.09009018912911415, "learning_rate": 6.883333333333333e-07, "loss": 0.0042, "num_tokens": 2614341.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8762 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.0, "completions/clipped_ratio": 0.0, "completions/max_length": 30.0, "completions/max_terminated_length": 30.0, "completions/mean_length": 24.0, "completions/mean_terminated_length": 24.0, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 162.27777777777777, "frac_reward_zero_std": 0.0, "grad_norm": 2.7589569091796875, "kl": 0.20081252977252007, "learning_rate": 6.877777777777778e-07, "loss": -0.1576, "num_tokens": 2614665.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 8763 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 162.2962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.008029725402593613, "kl": 0.0003635585308074951, "learning_rate": 6.872222222222223e-07, "loss": 0.0, "num_tokens": 2614877.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 8764 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.75, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.75, "completions/mean_terminated_length": 14.75, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 162.3148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.005457321181893349, "kl": 0.0013581507373601198, "learning_rate": 6.866666666666667e-07, "loss": 0.0001, "num_tokens": 2615196.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8765 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.75, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.75, "completions/mean_terminated_length": 14.75, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 162.33333333333334, "frac_reward_zero_std": 0.0, "grad_norm": 1.828214406967163, "kl": 0.02190638380125165, "learning_rate": 6.861111111111112e-07, "loss": -0.0147, "num_tokens": 2615487.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 8766 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.5, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 17.5, "completions/mean_terminated_length": 17.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 162.35185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.040696993470191956, "kl": 0.014363248366862535, "learning_rate": 6.855555555555555e-07, "loss": 0.0007, "num_tokens": 2615805.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8767 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.75, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 15.75, "completions/mean_terminated_length": 15.75, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 162.37037037037038, "frac_reward_zero_std": 1.0, "grad_norm": 0.03489837422966957, "kl": 0.03717591613531113, "learning_rate": 6.850000000000001e-07, "loss": 0.0019, "num_tokens": 2616100.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8768 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 162.38888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.011629129759967327, "kl": 0.008262120187282562, "learning_rate": 6.844444444444446e-07, "loss": 0.0004, "num_tokens": 2616336.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 8769 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 34.75, "completions/clipped_ratio": 0.0, "completions/max_length": 36.0, "completions/max_terminated_length": 36.0, "completions/mean_length": 34.75, "completions/mean_terminated_length": 34.75, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 162.40740740740742, "frac_reward_zero_std": 1.0, "grad_norm": 0.03400389105081558, "kl": 0.07355647161602974, "learning_rate": 6.838888888888889e-07, "loss": 0.0037, "num_tokens": 2616703.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8770 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.25, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 13.25, "completions/mean_terminated_length": 13.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 162.42592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.017683256417512894, "kl": 0.0065354734542779624, "learning_rate": 6.833333333333334e-07, "loss": 0.0003, "num_tokens": 2616988.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8771 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 162.44444444444446, "frac_reward_zero_std": 0.0, "grad_norm": 2.4747467041015625, "kl": 0.5629072189331055, "learning_rate": 6.827777777777779e-07, "loss": 0.0288, "num_tokens": 2617272.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 8772 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.25, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 18.25, "completions/mean_terminated_length": 18.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 162.46296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.2530249357223511, "kl": 0.035127343609929085, "learning_rate": 6.822222222222223e-07, "loss": 0.0019, "num_tokens": 2617597.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8773 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 162.4814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.0069051082246005535, "kl": 0.21369189023971558, "learning_rate": 6.816666666666668e-07, "loss": 0.0107, "num_tokens": 2617901.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8774 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.25, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.25, "completions/mean_terminated_length": 13.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 162.5, "frac_reward_zero_std": 1.0, "grad_norm": 0.062090933322906494, "kl": 0.013207650743424892, "learning_rate": 6.811111111111111e-07, "loss": 0.0006, "num_tokens": 2618170.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8775 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.0, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 21.0, "completions/mean_terminated_length": 21.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 162.5185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.25242066383361816, "kl": 0.05828919820487499, "learning_rate": 6.805555555555556e-07, "loss": 0.0029, "num_tokens": 2618474.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 8776 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 33.25, "completions/clipped_ratio": 0.0, "completions/max_length": 44.0, "completions/max_terminated_length": 44.0, "completions/mean_length": 33.25, "completions/mean_terminated_length": 33.25, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 162.53703703703704, "frac_reward_zero_std": 0.0, "grad_norm": 1.6037694215774536, "kl": 0.11343536525964737, "learning_rate": 6.800000000000001e-07, "loss": -0.1411, "num_tokens": 2618859.0, "reward": 7.875, "reward_std": 0.25, "rewards/reward_combined/mean": 7.875, "rewards/reward_combined/std": 0.25, "step": 8777 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 162.55555555555554, "frac_reward_zero_std": 1.0, "grad_norm": 0.007935560308396816, "kl": 0.009158565197139978, "learning_rate": 6.794444444444445e-07, "loss": 0.0005, "num_tokens": 2619131.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8778 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.25, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 11.25, "completions/mean_terminated_length": 11.25, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 162.57407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.041869014501571655, "kl": 0.005446813069283962, "learning_rate": 6.78888888888889e-07, "loss": 0.0003, "num_tokens": 2619396.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8779 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 36.25, "completions/clipped_ratio": 0.0, "completions/max_length": 47.0, "completions/max_terminated_length": 47.0, "completions/mean_length": 36.25, "completions/mean_terminated_length": 36.25, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 162.59259259259258, "frac_reward_zero_std": 0.0, "grad_norm": 1.9631099700927734, "kl": 0.1389857456088066, "learning_rate": 6.783333333333333e-07, "loss": -0.0178, "num_tokens": 2619765.0, "reward": 5.125, "reward_std": 3.4731109142303467, "rewards/reward_combined/mean": 5.125, "rewards/reward_combined/std": 3.4731109142303467, "step": 8780 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 35.75, "completions/clipped_ratio": 0.0, "completions/max_length": 41.0, "completions/max_terminated_length": 41.0, "completions/mean_length": 35.75, "completions/mean_terminated_length": 35.75, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 162.61111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.060046833008527756, "kl": 0.02924626087769866, "learning_rate": 6.777777777777779e-07, "loss": 0.0015, "num_tokens": 2620132.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8781 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.5, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 18.5, "completions/mean_terminated_length": 18.5, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 162.62962962962962, "frac_reward_zero_std": 1.0, "grad_norm": 0.008507728576660156, "kl": 0.0008487709856126457, "learning_rate": 6.772222222222223e-07, "loss": 0.0, "num_tokens": 2620446.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8782 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.25, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 19.25, "completions/mean_terminated_length": 19.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 162.64814814814815, "frac_reward_zero_std": 0.0, "grad_norm": 2.4170925617218018, "kl": 0.09740586578845978, "learning_rate": 6.766666666666667e-07, "loss": 0.0315, "num_tokens": 2620743.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 8783 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 10.5, "completions/mean_terminated_length": 10.5, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 162.66666666666666, "frac_reward_zero_std": 1.0, "grad_norm": 0.04950767755508423, "kl": 0.0029548684833571315, "learning_rate": 6.761111111111112e-07, "loss": 0.0002, "num_tokens": 2621005.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8784 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 162.6851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 3.180262865498662e-05, "kl": 3.993511199951172e-06, "learning_rate": 6.755555555555555e-07, "loss": 0.0, "num_tokens": 2621225.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 8785 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 27.0, "completions/clipped_ratio": 0.0, "completions/max_length": 30.0, "completions/max_terminated_length": 30.0, "completions/mean_length": 27.0, "completions/mean_terminated_length": 27.0, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 162.7037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.04505655914545059, "kl": 0.1020134910941124, "learning_rate": 6.750000000000001e-07, "loss": 0.0051, "num_tokens": 2621569.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8786 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.75, "completions/clipped_ratio": 0.0, "completions/max_length": 27.0, "completions/max_terminated_length": 27.0, "completions/mean_length": 20.75, "completions/mean_terminated_length": 20.75, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 162.72222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.16127479076385498, "kl": 0.04465607739984989, "learning_rate": 6.744444444444446e-07, "loss": 0.0021, "num_tokens": 2621900.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 8787 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 46.25, "completions/clipped_ratio": 0.0, "completions/max_length": 51.0, "completions/max_terminated_length": 51.0, "completions/mean_length": 46.25, "completions/mean_terminated_length": 46.25, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 162.74074074074073, "frac_reward_zero_std": 0.0, "grad_norm": 1.7048991918563843, "kl": 0.18394062668085098, "learning_rate": 6.738888888888889e-07, "loss": -0.0229, "num_tokens": 2622365.0, "reward": 1.1749999523162842, "reward_std": 3.6499998569488525, "rewards/reward_combined/mean": 1.1749999523162842, "rewards/reward_combined/std": 3.6500003337860107, "step": 8788 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.25, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 13.25, "completions/mean_terminated_length": 13.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 162.75925925925927, "frac_reward_zero_std": 1.0, "grad_norm": 0.015931962057948112, "kl": 0.0013441101182252169, "learning_rate": 6.733333333333334e-07, "loss": 0.0001, "num_tokens": 2622630.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8789 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.25, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 3.25, "completions/mean_terminated_length": 3.25, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 162.77777777777777, "frac_reward_zero_std": 0.0, "grad_norm": 15.075922966003418, "kl": 0.17687086015939713, "learning_rate": 6.727777777777778e-07, "loss": -0.0456, "num_tokens": 2622839.0, "reward": 2.75, "reward_std": 1.5, "rewards/reward_combined/mean": 2.75, "rewards/reward_combined/std": 1.5, "step": 8790 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.5, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 8.5, "completions/mean_terminated_length": 8.5, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 162.7962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.003680346067994833, "kl": 0.0002187803474953398, "learning_rate": 6.722222222222223e-07, "loss": 0.0, "num_tokens": 2623093.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 8791 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 162.8148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.013375685550272465, "kl": 0.0032652616500854492, "learning_rate": 6.716666666666668e-07, "loss": 0.0002, "num_tokens": 2623353.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 8792 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 5.5, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 5.5, "completions/mean_terminated_length": 5.5, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 162.83333333333334, "frac_reward_zero_std": 1.0, "grad_norm": 0.0910872295498848, "kl": 0.004782582924235612, "learning_rate": 6.711111111111111e-07, "loss": 0.0003, "num_tokens": 2623575.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 8793 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 162.85185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.014686145819723606, "kl": 0.00813616905361414, "learning_rate": 6.705555555555556e-07, "loss": 0.0004, "num_tokens": 2623887.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 8794 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.75, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.75, "completions/mean_terminated_length": 13.75, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 162.87037037037038, "frac_reward_zero_std": 1.0, "grad_norm": 0.015283973887562752, "kl": 0.002582069195341319, "learning_rate": 6.7e-07, "loss": 0.0001, "num_tokens": 2624166.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8795 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 162.88888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.07085031270980835, "kl": 0.016940576722845435, "learning_rate": 6.694444444444445e-07, "loss": 0.0009, "num_tokens": 2624434.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8796 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.25, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 9.25, "completions/mean_terminated_length": 9.25, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 162.90740740740742, "frac_reward_zero_std": 1.0, "grad_norm": 0.12261060625314713, "kl": 0.025272822938859463, "learning_rate": 6.68888888888889e-07, "loss": 0.0013, "num_tokens": 2624695.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8797 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 162.92592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.021437868475914, "kl": 0.002713362337090075, "learning_rate": 6.683333333333333e-07, "loss": 0.0001, "num_tokens": 2624999.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8798 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 44.25, "completions/clipped_ratio": 0.0, "completions/max_length": 58.0, "completions/max_terminated_length": 58.0, "completions/mean_length": 44.25, "completions/mean_terminated_length": 44.25, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 162.94444444444446, "frac_reward_zero_std": 0.0, "grad_norm": 1.6876437664031982, "kl": 0.024638233706355095, "learning_rate": 6.677777777777779e-07, "loss": 0.0036, "num_tokens": 2625396.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 8799 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 27.25, "completions/clipped_ratio": 0.0, "completions/max_length": 30.0, "completions/max_terminated_length": 30.0, "completions/mean_length": 27.25, "completions/mean_terminated_length": 27.25, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 162.96296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.21910731494426727, "kl": 0.1894092857837677, "learning_rate": 6.672222222222224e-07, "loss": 0.0091, "num_tokens": 2625721.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8800 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.0, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 19.0, "completions/mean_terminated_length": 19.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 162.9814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.06713271141052246, "kl": 0.01723010139539838, "learning_rate": 6.666666666666667e-07, "loss": 0.0007, "num_tokens": 2626041.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8801 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.75, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 19.75, "completions/mean_terminated_length": 19.75, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 163.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.022998487576842308, "kl": 0.006565834395587444, "learning_rate": 6.661111111111112e-07, "loss": 0.0003, "num_tokens": 2626348.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8802 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.25, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 13.25, "completions/mean_terminated_length": 13.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 163.0185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.024893011897802353, "kl": 0.001575132948346436, "learning_rate": 6.655555555555556e-07, "loss": 0.0001, "num_tokens": 2626613.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8803 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 26.75, "completions/clipped_ratio": 0.0, "completions/max_length": 30.0, "completions/max_terminated_length": 30.0, "completions/mean_length": 26.75, "completions/mean_terminated_length": 26.75, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 163.03703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.06034619361162186, "kl": 0.11613167822360992, "learning_rate": 6.650000000000001e-07, "loss": 0.0059, "num_tokens": 2626936.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8804 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 163.05555555555554, "frac_reward_zero_std": 1.0, "grad_norm": 0.01461832132190466, "kl": 0.008130977861583233, "learning_rate": 6.644444444444446e-07, "loss": 0.0004, "num_tokens": 2627248.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 8805 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.015625, "clip_ratio/region_mean": 0.015625, "completion_length": 17.25, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 17.25, "completions/mean_terminated_length": 17.25, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 163.07407407407408, "frac_reward_zero_std": 0.0, "grad_norm": 3.5647292137145996, "kl": 0.08545712009072304, "learning_rate": 6.638888888888889e-07, "loss": 0.1542, "num_tokens": 2627541.0, "reward": 5.125, "reward_std": 3.4731109142303467, "rewards/reward_combined/mean": 5.125, "rewards/reward_combined/std": 3.4731109142303467, "step": 8806 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.25, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 17.25, "completions/mean_terminated_length": 17.25, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 163.09259259259258, "frac_reward_zero_std": 1.0, "grad_norm": 0.03471585735678673, "kl": 0.01315001305192709, "learning_rate": 6.633333333333334e-07, "loss": 0.0006, "num_tokens": 2627838.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8807 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 163.11111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.010058511048555374, "kl": 0.0016051511629484594, "learning_rate": 6.627777777777778e-07, "loss": 0.0001, "num_tokens": 2628160.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8808 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 39.75, "completions/clipped_ratio": 0.0, "completions/max_length": 44.0, "completions/max_terminated_length": 44.0, "completions/mean_length": 39.75, "completions/mean_terminated_length": 39.75, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 163.12962962962962, "frac_reward_zero_std": 0.0, "grad_norm": 2.5877132415771484, "kl": 0.09444498270750046, "learning_rate": 6.622222222222223e-07, "loss": 0.0682, "num_tokens": 2628571.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 8809 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 163.14814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.011174790561199188, "kl": 0.008390344679355621, "learning_rate": 6.616666666666668e-07, "loss": 0.0004, "num_tokens": 2628807.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 8810 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 5.5, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 5.5, "completions/mean_terminated_length": 5.5, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 163.16666666666666, "frac_reward_zero_std": 0.0, "grad_norm": 3.49141788482666, "kl": 0.2284712612745352, "learning_rate": 6.611111111111111e-07, "loss": 0.1007, "num_tokens": 2629029.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 8811 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 163.1851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 4.8045632865978405e-05, "kl": 4.507601261138916e-06, "learning_rate": 6.605555555555557e-07, "loss": 0.0, "num_tokens": 2629249.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 8812 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.5, "completions/clipped_ratio": 0.0, "completions/max_length": 37.0, "completions/max_terminated_length": 37.0, "completions/mean_length": 25.5, "completions/mean_terminated_length": 25.5, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 163.2037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.8640691041946411, "kl": 0.17720740288496017, "learning_rate": 6.6e-07, "loss": 0.0093, "num_tokens": 2629579.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8813 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.75, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.75, "completions/mean_terminated_length": 6.75, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 163.22222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.020174924284219742, "kl": 0.0011127896432299167, "learning_rate": 6.594444444444445e-07, "loss": 0.0001, "num_tokens": 2629814.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 8814 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.75, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 8.75, "completions/mean_terminated_length": 8.75, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 163.24074074074073, "frac_reward_zero_std": 1.0, "grad_norm": 0.0947631374001503, "kl": 0.031126005575060844, "learning_rate": 6.58888888888889e-07, "loss": 0.0016, "num_tokens": 2630049.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 8815 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 163.25925925925927, "frac_reward_zero_std": 1.0, "grad_norm": 0.09961453825235367, "kl": 0.005034089088439941, "learning_rate": 6.583333333333333e-07, "loss": 0.0003, "num_tokens": 2630261.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 8816 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 25.0, "completions/max_terminated_length": 25.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 163.27777777777777, "frac_reward_zero_std": 0.0, "grad_norm": 3.388460874557495, "kl": 0.03184870025143027, "learning_rate": 6.577777777777779e-07, "loss": 0.2025, "num_tokens": 2630543.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 8817 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.25, "completions/clipped_ratio": 0.0, "completions/max_length": 29.0, "completions/max_terminated_length": 29.0, "completions/mean_length": 23.25, "completions/mean_terminated_length": 23.25, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 163.2962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.046056732535362244, "kl": 0.017550567165017128, "learning_rate": 6.572222222222223e-07, "loss": 0.0009, "num_tokens": 2630884.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 8818 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 163.3148148148148, "frac_reward_zero_std": 0.0, "grad_norm": 3.2895331382751465, "kl": 0.03568828967399895, "learning_rate": 6.566666666666667e-07, "loss": 0.0173, "num_tokens": 2631158.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 8819 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.5, "completions/clipped_ratio": 0.0, "completions/max_length": 27.0, "completions/max_terminated_length": 27.0, "completions/mean_length": 20.5, "completions/mean_terminated_length": 20.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 163.33333333333334, "frac_reward_zero_std": 1.0, "grad_norm": 0.07526985555887222, "kl": 0.026213600300252438, "learning_rate": 6.561111111111112e-07, "loss": 0.0013, "num_tokens": 2631460.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8820 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 163.35185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.012258783914148808, "kl": 0.23929350823163986, "learning_rate": 6.555555555555556e-07, "loss": 0.0119, "num_tokens": 2631760.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8821 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 31.0, "completions/clipped_ratio": 0.0, "completions/max_length": 33.0, "completions/max_terminated_length": 33.0, "completions/mean_length": 31.0, "completions/mean_terminated_length": 31.0, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 163.37037037037038, "frac_reward_zero_std": 1.0, "grad_norm": 0.07069263607263565, "kl": 0.07807424291968346, "learning_rate": 6.550000000000001e-07, "loss": 0.0039, "num_tokens": 2632108.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8822 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 163.38888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.02029535546898842, "kl": 0.007067184429615736, "learning_rate": 6.544444444444445e-07, "loss": 0.0004, "num_tokens": 2632396.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8823 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.75, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.75, "completions/mean_terminated_length": 14.75, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 163.40740740740742, "frac_reward_zero_std": 1.0, "grad_norm": 0.019321251660585403, "kl": 0.0062633649504277855, "learning_rate": 6.538888888888889e-07, "loss": 0.0003, "num_tokens": 2632687.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8824 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.0, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 20.0, "completions/mean_terminated_length": 20.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 163.42592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.0538581945002079, "kl": 0.00777953362558037, "learning_rate": 6.533333333333334e-07, "loss": 0.0004, "num_tokens": 2633007.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8825 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 163.44444444444446, "frac_reward_zero_std": 1.0, "grad_norm": 0.00613643042743206, "kl": 0.00015850961062824354, "learning_rate": 6.527777777777778e-07, "loss": 0.0, "num_tokens": 2633263.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 8826 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 163.46296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.05712060630321503, "kl": 0.17062976956367493, "learning_rate": 6.522222222222223e-07, "loss": 0.0085, "num_tokens": 2633547.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8827 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 163.4814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.07113651186227798, "kl": 0.028790588257834315, "learning_rate": 6.516666666666667e-07, "loss": 0.0016, "num_tokens": 2633845.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8828 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.5, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 18.5, "completions/mean_terminated_length": 18.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 163.5, "frac_reward_zero_std": 0.0, "grad_norm": 4.565160751342773, "kl": 0.191266730427742, "learning_rate": 6.511111111111111e-07, "loss": 0.0238, "num_tokens": 2634159.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 8829 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.75, "completions/clipped_ratio": 0.0, "completions/max_length": 27.0, "completions/max_terminated_length": 27.0, "completions/mean_length": 23.75, "completions/mean_terminated_length": 23.75, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 163.5185185185185, "frac_reward_zero_std": 0.0, "grad_norm": 3.0677947998046875, "kl": 0.065806336235255, "learning_rate": 6.505555555555557e-07, "loss": 0.0348, "num_tokens": 2634482.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 8830 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 163.53703703703704, "frac_reward_zero_std": 0.0, "grad_norm": 8.702807426452637, "kl": 0.10654794424772263, "learning_rate": 6.5e-07, "loss": 0.0837, "num_tokens": 2634748.0, "reward": 7.0, "reward_std": 1.0, "rewards/reward_combined/mean": 7.0, "rewards/reward_combined/std": 1.0, "step": 8831 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 33.0, "completions/clipped_ratio": 0.0, "completions/max_length": 40.0, "completions/max_terminated_length": 40.0, "completions/mean_length": 33.0, "completions/mean_terminated_length": 33.0, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 163.55555555555554, "frac_reward_zero_std": 1.0, "grad_norm": 0.5234564542770386, "kl": 0.1735900342464447, "learning_rate": 6.494444444444445e-07, "loss": 0.0088, "num_tokens": 2635116.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8832 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.25, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.25, "completions/mean_terminated_length": 14.25, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 163.57407407407408, "frac_reward_zero_std": 0.0, "grad_norm": 1.4183052778244019, "kl": 0.9442747831344604, "learning_rate": 6.488888888888889e-07, "loss": -0.0124, "num_tokens": 2635397.0, "reward": 7.875, "reward_std": 0.25, "rewards/reward_combined/mean": 7.875, "rewards/reward_combined/std": 0.25, "step": 8833 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 163.59259259259258, "frac_reward_zero_std": 1.0, "grad_norm": 0.007622723933309317, "kl": 0.0011818557977676392, "learning_rate": 6.483333333333334e-07, "loss": 0.0001, "num_tokens": 2635609.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 8834 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.25, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 8.25, "completions/mean_terminated_length": 8.25, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 163.61111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.04299071058630943, "kl": 0.019699912518262863, "learning_rate": 6.477777777777779e-07, "loss": 0.001, "num_tokens": 2635854.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 8835 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 83.25, "completions/clipped_ratio": 0.25, "completions/max_length": 256.0, "completions/max_terminated_length": 37.0, "completions/mean_length": 83.25, "completions/mean_terminated_length": 25.666667938232422, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 163.62962962962962, "frac_reward_zero_std": 0.0, "grad_norm": 2.48755145072937, "kl": 0.34894588962197304, "learning_rate": 6.472222222222223e-07, "loss": 0.3921, "num_tokens": 2636407.0, "reward": 5.550000190734863, "reward_std": 3.9000003337860107, "rewards/reward_combined/mean": 5.550000190734863, "rewards/reward_combined/std": 3.9000000953674316, "step": 8836 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 163.64814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.029933476820588112, "kl": 0.0028085856465622783, "learning_rate": 6.466666666666667e-07, "loss": 0.0001, "num_tokens": 2636667.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 8837 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.0, "completions/clipped_ratio": 0.0, "completions/max_length": 25.0, "completions/max_terminated_length": 25.0, "completions/mean_length": 23.0, "completions/mean_terminated_length": 23.0, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 163.66666666666666, "frac_reward_zero_std": 1.0, "grad_norm": 0.04794839769601822, "kl": 0.1407208014279604, "learning_rate": 6.461111111111111e-07, "loss": 0.0072, "num_tokens": 2637007.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8838 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 48.0, "completions/clipped_ratio": 0.0, "completions/max_length": 51.0, "completions/max_terminated_length": 51.0, "completions/mean_length": 48.0, "completions/mean_terminated_length": 48.0, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 163.6851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.07072728872299194, "kl": 0.049346674233675, "learning_rate": 6.455555555555556e-07, "loss": 0.0025, "num_tokens": 2637479.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 8839 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.75, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 12.75, "completions/mean_terminated_length": 12.75, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 163.7037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.3198639154434204, "kl": 0.048287924844771624, "learning_rate": 6.450000000000001e-07, "loss": 0.0029, "num_tokens": 2637754.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8840 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 12.5, "completions/mean_terminated_length": 12.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 163.72222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.011873195879161358, "kl": 0.002588329487480223, "learning_rate": 6.444444444444445e-07, "loss": 0.0001, "num_tokens": 2638036.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8841 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 10.5, "completions/mean_terminated_length": 10.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 163.74074074074073, "frac_reward_zero_std": 1.0, "grad_norm": 0.03265048936009407, "kl": 0.002696860348805785, "learning_rate": 6.438888888888889e-07, "loss": 0.0001, "num_tokens": 2638298.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8842 }, { "clip_ratio/high_max": 0.0078125, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "completion_length": 47.5, "completions/clipped_ratio": 0.0, "completions/max_length": 96.0, "completions/max_terminated_length": 96.0, "completions/mean_length": 47.5, "completions/mean_terminated_length": 47.5, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 163.75925925925927, "frac_reward_zero_std": 0.0, "grad_norm": 3.228426218032837, "kl": 0.11254844814538956, "learning_rate": 6.433333333333335e-07, "loss": 0.2718, "num_tokens": 2638712.0, "reward": 5.75, "reward_std": 4.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 4.5, "step": 8843 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 163.77777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.002599625615403056, "kl": 0.2823063135147095, "learning_rate": 6.427777777777778e-07, "loss": 0.0141, "num_tokens": 2639000.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8844 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.25, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 15.25, "completions/mean_terminated_length": 15.25, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 163.7962962962963, "frac_reward_zero_std": 0.0, "grad_norm": 4.762119770050049, "kl": 0.11466246098279953, "learning_rate": 6.422222222222223e-07, "loss": -0.0673, "num_tokens": 2639281.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 8845 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 163.8148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.025927020236849785, "kl": 0.010090603493154049, "learning_rate": 6.416666666666667e-07, "loss": 0.0005, "num_tokens": 2639553.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8846 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 16.5, "completions/mean_terminated_length": 16.5, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 163.83333333333334, "frac_reward_zero_std": 1.0, "grad_norm": 0.07873158901929855, "kl": 0.010050173266790807, "learning_rate": 6.411111111111111e-07, "loss": 0.0005, "num_tokens": 2639859.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8847 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 163.85185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.048263244330883026, "kl": 0.0014146193861961365, "learning_rate": 6.405555555555557e-07, "loss": 0.0001, "num_tokens": 2640071.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 8848 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 163.87037037037038, "frac_reward_zero_std": 1.0, "grad_norm": 0.02049924060702324, "kl": 0.01942627690732479, "learning_rate": 6.4e-07, "loss": 0.001, "num_tokens": 2640359.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8849 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.0, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 20.0, "completions/mean_terminated_length": 20.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 163.88888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.06096866354346275, "kl": 0.01745960395783186, "learning_rate": 6.394444444444445e-07, "loss": 0.0008, "num_tokens": 2640691.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8850 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 163.90740740740742, "frac_reward_zero_std": 1.0, "grad_norm": 0.018955597653985023, "kl": 0.014180365949869156, "learning_rate": 6.388888888888889e-07, "loss": 0.0007, "num_tokens": 2640951.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8851 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 163.92592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.008010564371943474, "kl": 0.0032007135450839996, "learning_rate": 6.383333333333334e-07, "loss": 0.0002, "num_tokens": 2641211.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 8852 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.25, "completions/clipped_ratio": 0.0, "completions/max_length": 33.0, "completions/max_terminated_length": 33.0, "completions/mean_length": 32.25, "completions/mean_terminated_length": 32.25, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 163.94444444444446, "frac_reward_zero_std": 1.0, "grad_norm": 0.10426022857427597, "kl": 0.10219886153936386, "learning_rate": 6.377777777777779e-07, "loss": 0.0051, "num_tokens": 2641608.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8853 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 33.5, "completions/clipped_ratio": 0.0, "completions/max_length": 36.0, "completions/max_terminated_length": 36.0, "completions/mean_length": 33.5, "completions/mean_terminated_length": 33.5, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 163.96296296296296, "frac_reward_zero_std": 0.0, "grad_norm": 1.5014207363128662, "kl": 0.23293668404221535, "learning_rate": 6.372222222222223e-07, "loss": -0.0356, "num_tokens": 2641970.0, "reward": 7.875, "reward_std": 0.25, "rewards/reward_combined/mean": 7.875, "rewards/reward_combined/std": 0.25, "step": 8854 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 163.9814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.027170857414603233, "kl": 0.0045069955522194505, "learning_rate": 6.366666666666667e-07, "loss": 0.0002, "num_tokens": 2642278.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8855 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.75, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 13.75, "completions/mean_terminated_length": 13.75, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 164.0, "frac_reward_zero_std": 0.0, "grad_norm": 1.8775160312652588, "kl": 0.6749255657196045, "learning_rate": 6.361111111111111e-07, "loss": 0.1324, "num_tokens": 2642557.0, "reward": 5.5, "reward_std": 5.0, "rewards/reward_combined/mean": 5.5, "rewards/reward_combined/std": 5.0, "step": 8856 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.25, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 4.25, "completions/mean_terminated_length": 4.25, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 164.0185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.01895551010966301, "kl": 0.00040548891411162913, "learning_rate": 6.355555555555556e-07, "loss": 0.0, "num_tokens": 2642770.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 8857 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.75, "completions/clipped_ratio": 0.0, "completions/max_length": 33.0, "completions/max_terminated_length": 33.0, "completions/mean_length": 23.75, "completions/mean_terminated_length": 23.75, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 164.03703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 4.641910076141357, "kl": 0.42594147473573685, "learning_rate": 6.350000000000001e-07, "loss": 0.0226, "num_tokens": 2643085.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 8858 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 44.5, "completions/clipped_ratio": 0.0, "completions/max_length": 51.0, "completions/max_terminated_length": 51.0, "completions/mean_length": 44.5, "completions/mean_terminated_length": 44.5, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 164.05555555555554, "frac_reward_zero_std": 0.0, "grad_norm": 0.7946110367774963, "kl": 0.1573222503066063, "learning_rate": 6.344444444444445e-07, "loss": -0.0986, "num_tokens": 2643543.0, "reward": 2.375, "reward_std": 1.25, "rewards/reward_combined/mean": 2.375, "rewards/reward_combined/std": 1.25, "step": 8859 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 40.75, "completions/clipped_ratio": 0.0, "completions/max_length": 142.0, "completions/max_terminated_length": 142.0, "completions/mean_length": 40.75, "completions/mean_terminated_length": 40.75, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 164.07407407407408, "frac_reward_zero_std": 0.0, "grad_norm": 2.4643442630767822, "kl": 0.019218144938349724, "learning_rate": 6.338888888888889e-07, "loss": 0.4567, "num_tokens": 2643914.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 8860 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 164.09259259259258, "frac_reward_zero_std": 1.0, "grad_norm": 0.04422682151198387, "kl": 0.014620989561080933, "learning_rate": 6.333333333333334e-07, "loss": 0.0007, "num_tokens": 2644182.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8861 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 164.11111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.011100633069872856, "kl": 0.008329346776008606, "learning_rate": 6.327777777777778e-07, "loss": 0.0004, "num_tokens": 2644418.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 8862 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.75, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 14.75, "completions/mean_terminated_length": 14.75, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 164.12962962962962, "frac_reward_zero_std": 1.0, "grad_norm": 0.026323916390538216, "kl": 0.006421791855245829, "learning_rate": 6.322222222222223e-07, "loss": 0.0003, "num_tokens": 2644693.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8863 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.25, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 9.25, "completions/mean_terminated_length": 9.25, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 164.14814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.04179736226797104, "kl": 0.003357184585183859, "learning_rate": 6.316666666666667e-07, "loss": 0.0002, "num_tokens": 2644942.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 8864 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 164.16666666666666, "frac_reward_zero_std": 0.0, "grad_norm": 4.107403755187988, "kl": 0.47092482447624207, "learning_rate": 6.311111111111112e-07, "loss": -0.01, "num_tokens": 2645224.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 8865 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 30.25, "completions/clipped_ratio": 0.0, "completions/max_length": 36.0, "completions/max_terminated_length": 36.0, "completions/mean_length": 30.25, "completions/mean_terminated_length": 30.25, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 164.1851851851852, "frac_reward_zero_std": 0.0, "grad_norm": 3.495720386505127, "kl": 0.13364174962043762, "learning_rate": 6.305555555555556e-07, "loss": 0.0695, "num_tokens": 2645569.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 8866 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.5, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 21.5, "completions/mean_terminated_length": 21.5, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 164.2037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.018126703798770905, "kl": 0.004716446273960173, "learning_rate": 6.3e-07, "loss": 0.0002, "num_tokens": 2645899.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8867 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 164.22222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.09209167212247849, "kl": 0.010045178234577179, "learning_rate": 6.294444444444445e-07, "loss": 0.0004, "num_tokens": 2646107.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 8868 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 33.5, "completions/clipped_ratio": 0.0, "completions/max_length": 38.0, "completions/max_terminated_length": 38.0, "completions/mean_length": 33.5, "completions/mean_terminated_length": 33.5, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 164.24074074074073, "frac_reward_zero_std": 1.0, "grad_norm": 4.315784931182861, "kl": 0.4663703516125679, "learning_rate": 6.288888888888889e-07, "loss": 0.0241, "num_tokens": 2646469.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8869 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 164.25925925925927, "frac_reward_zero_std": 1.0, "grad_norm": 0.021813053637742996, "kl": 0.011015607044100761, "learning_rate": 6.283333333333334e-07, "loss": 0.0006, "num_tokens": 2646781.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 8870 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.25, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 13.25, "completions/mean_terminated_length": 13.25, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 164.27777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.02582646906375885, "kl": 0.001757207966875285, "learning_rate": 6.277777777777778e-07, "loss": 0.0001, "num_tokens": 2647046.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8871 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.75, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 11.75, "completions/mean_terminated_length": 11.75, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 164.2962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.03823590278625488, "kl": 0.003373673069290817, "learning_rate": 6.272222222222223e-07, "loss": 0.0002, "num_tokens": 2647313.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8872 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 9.5, "completions/mean_terminated_length": 9.5, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 164.3148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.034017112106084824, "kl": 0.0029902359237894416, "learning_rate": 6.266666666666667e-07, "loss": 0.0002, "num_tokens": 2647571.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8873 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 164.33333333333334, "frac_reward_zero_std": 1.0, "grad_norm": 0.006210764870047569, "kl": 0.22633343935012817, "learning_rate": 6.261111111111112e-07, "loss": 0.0113, "num_tokens": 2647873.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8874 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.5, "completions/clipped_ratio": 0.0, "completions/max_length": 38.0, "completions/max_terminated_length": 38.0, "completions/mean_length": 32.5, "completions/mean_terminated_length": 32.5, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 164.35185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.0389096662402153, "kl": 0.03394784778356552, "learning_rate": 6.255555555555556e-07, "loss": 0.0017, "num_tokens": 2648227.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8875 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 164.37037037037038, "frac_reward_zero_std": 1.0, "grad_norm": 0.011636913754045963, "kl": 0.004318037070333958, "learning_rate": 6.25e-07, "loss": 0.0002, "num_tokens": 2648515.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8876 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.25, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 17.25, "completions/mean_terminated_length": 17.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 164.38888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.02601213939487934, "kl": 0.17429929971694946, "learning_rate": 6.244444444444445e-07, "loss": 0.0087, "num_tokens": 2648824.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8877 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.5, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 18.5, "completions/mean_terminated_length": 18.5, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 164.40740740740742, "frac_reward_zero_std": 1.0, "grad_norm": 0.05094074457883835, "kl": 0.014730165712535381, "learning_rate": 6.23888888888889e-07, "loss": 0.0007, "num_tokens": 2649118.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8878 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 164.42592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.025140071287751198, "kl": 0.002886968431994319, "learning_rate": 6.233333333333333e-07, "loss": 0.0001, "num_tokens": 2649394.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8879 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 164.44444444444446, "frac_reward_zero_std": 1.0, "grad_norm": 0.0371200367808342, "kl": 0.007879509124904871, "learning_rate": 6.227777777777778e-07, "loss": 0.0004, "num_tokens": 2649666.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8880 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.5, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 19.5, "completions/mean_terminated_length": 19.5, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 164.46296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.04077239707112312, "kl": 0.0020401753718033433, "learning_rate": 6.222222222222223e-07, "loss": 0.0001, "num_tokens": 2649984.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8881 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 164.4814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 1.2741747923428193e-05, "kl": 4.135072231292725e-06, "learning_rate": 6.216666666666667e-07, "loss": 0.0, "num_tokens": 2650204.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 8882 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 164.5, "frac_reward_zero_std": 1.0, "grad_norm": 0.038490068167448044, "kl": 0.002571508288383484, "learning_rate": 6.211111111111112e-07, "loss": 0.0001, "num_tokens": 2650416.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 8883 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.75, "completions/clipped_ratio": 0.0, "completions/max_length": 31.0, "completions/max_terminated_length": 31.0, "completions/mean_length": 24.75, "completions/mean_terminated_length": 24.75, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 164.5185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.05243133381009102, "kl": 0.05564923584461212, "learning_rate": 6.205555555555556e-07, "loss": 0.0025, "num_tokens": 2650743.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8884 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.25, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 19.25, "completions/mean_terminated_length": 19.25, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 164.53703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.023631799966096878, "kl": 0.005514675984159112, "learning_rate": 6.200000000000001e-07, "loss": 0.0003, "num_tokens": 2651048.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8885 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 164.55555555555554, "frac_reward_zero_std": 0.0, "grad_norm": 9.015396118164062, "kl": 0.04829757288098335, "learning_rate": 6.194444444444446e-07, "loss": 0.1975, "num_tokens": 2651320.0, "reward": 6.5, "reward_std": 2.0, "rewards/reward_combined/mean": 6.5, "rewards/reward_combined/std": 2.0, "step": 8886 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 35.5, "completions/clipped_ratio": 0.0, "completions/max_length": 37.0, "completions/max_terminated_length": 37.0, "completions/mean_length": 35.5, "completions/mean_terminated_length": 35.5, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 164.57407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.14178547263145447, "kl": 0.0937521755695343, "learning_rate": 6.188888888888889e-07, "loss": 0.0047, "num_tokens": 2651714.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8887 }, { "clip_ratio/high_max": 0.008064515888690948, "clip_ratio/high_mean": 0.008064515888690948, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.008064515888690948, "completion_length": 26.5, "completions/clipped_ratio": 0.0, "completions/max_length": 42.0, "completions/max_terminated_length": 42.0, "completions/mean_length": 26.5, "completions/mean_terminated_length": 26.5, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 164.59259259259258, "frac_reward_zero_std": 0.0, "grad_norm": 3.1514036655426025, "kl": 0.034531977493315935, "learning_rate": 6.183333333333334e-07, "loss": 0.1811, "num_tokens": 2652072.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 8888 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.75, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.75, "completions/mean_terminated_length": 14.75, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 164.61111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.01171795092523098, "kl": 0.02156050316989422, "learning_rate": 6.177777777777778e-07, "loss": 0.0011, "num_tokens": 2652363.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8889 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.75, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 4.75, "completions/mean_terminated_length": 4.75, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 164.62962962962962, "frac_reward_zero_std": 1.0, "grad_norm": 0.038538601249456406, "kl": 0.0026711999671533704, "learning_rate": 6.172222222222223e-07, "loss": 0.0001, "num_tokens": 2652582.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 8890 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 164.64814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.04945401847362518, "kl": 0.2839692682027817, "learning_rate": 6.166666666666668e-07, "loss": 0.0142, "num_tokens": 2652870.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8891 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 164.66666666666666, "frac_reward_zero_std": 1.0, "grad_norm": 0.17213483154773712, "kl": 0.016579593531787395, "learning_rate": 6.161111111111111e-07, "loss": 0.001, "num_tokens": 2653142.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 8892 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.5, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 17.5, "completions/mean_terminated_length": 17.5, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 164.6851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.8364965915679932, "kl": 0.26135364174842834, "learning_rate": 6.155555555555556e-07, "loss": 0.0138, "num_tokens": 2653432.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8893 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.0, "completions/clipped_ratio": 0.0, "completions/max_length": 29.0, "completions/max_terminated_length": 29.0, "completions/mean_length": 23.0, "completions/mean_terminated_length": 23.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 164.7037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 1.7771413326263428, "kl": 0.12301535904407501, "learning_rate": 6.15e-07, "loss": 0.007, "num_tokens": 2653772.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 8894 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 29.75, "completions/clipped_ratio": 0.0, "completions/max_length": 38.0, "completions/max_terminated_length": 38.0, "completions/mean_length": 29.75, "completions/mean_terminated_length": 29.75, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 164.72222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.1289510279893875, "kl": 0.05976126901805401, "learning_rate": 6.144444444444445e-07, "loss": 0.0028, "num_tokens": 2654111.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8895 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.5, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 19.5, "completions/mean_terminated_length": 19.5, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 164.74074074074073, "frac_reward_zero_std": 1.0, "grad_norm": 0.04972969368100166, "kl": 0.006709648878313601, "learning_rate": 6.13888888888889e-07, "loss": 0.0003, "num_tokens": 2654437.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8896 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.25, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.25, "completions/mean_terminated_length": 13.25, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 164.75925925925927, "frac_reward_zero_std": 0.0, "grad_norm": 8.019563674926758, "kl": 0.0285136898746714, "learning_rate": 6.133333333333333e-07, "loss": 0.0218, "num_tokens": 2654722.0, "reward": 6.75, "reward_std": 2.5, "rewards/reward_combined/mean": 6.75, "rewards/reward_combined/std": 2.5, "step": 8897 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 164.77777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.09970246255397797, "kl": 0.009893432259559631, "learning_rate": 6.127777777777778e-07, "loss": 0.0005, "num_tokens": 2654982.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 8898 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 164.7962962962963, "frac_reward_zero_std": 0.0, "grad_norm": 1.9103167057037354, "kl": 0.08841741626383737, "learning_rate": 6.122222222222222e-07, "loss": 0.005, "num_tokens": 2655284.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 8899 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.5, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 22.5, "completions/mean_terminated_length": 22.5, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 164.8148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.025664597749710083, "kl": 0.00862408196553588, "learning_rate": 6.116666666666667e-07, "loss": 0.0004, "num_tokens": 2655602.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8900 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 164.83333333333334, "frac_reward_zero_std": 1.0, "grad_norm": 0.013568569906055927, "kl": 0.0268426313996315, "learning_rate": 6.111111111111112e-07, "loss": 0.0013, "num_tokens": 2655818.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 8901 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.0, "completions/max_length": 36.0, "completions/max_terminated_length": 36.0, "completions/mean_length": 32.0, "completions/mean_terminated_length": 32.0, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 164.85185185185185, "frac_reward_zero_std": 0.0, "grad_norm": 3.1738851070404053, "kl": 0.0999598503112793, "learning_rate": 6.105555555555556e-07, "loss": 0.072, "num_tokens": 2656182.0, "reward": 6.5, "reward_std": 2.0, "rewards/reward_combined/mean": 6.5, "rewards/reward_combined/std": 2.0, "step": 8902 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 27.0, "completions/clipped_ratio": 0.0, "completions/max_length": 29.0, "completions/max_terminated_length": 29.0, "completions/mean_length": 27.0, "completions/mean_terminated_length": 27.0, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 164.87037037037038, "frac_reward_zero_std": 0.0, "grad_norm": 3.418299436569214, "kl": 0.4003163352608681, "learning_rate": 6.100000000000001e-07, "loss": -0.0487, "num_tokens": 2656506.0, "reward": 4.0, "reward_std": 2.915475845336914, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 2.915475845336914, "step": 8903 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 39.25, "completions/clipped_ratio": 0.0, "completions/max_length": 45.0, "completions/max_terminated_length": 45.0, "completions/mean_length": 39.25, "completions/mean_terminated_length": 39.25, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 164.88888888888889, "frac_reward_zero_std": 0.0, "grad_norm": 0.7996443510055542, "kl": 0.4009246900677681, "learning_rate": 6.094444444444446e-07, "loss": -0.1242, "num_tokens": 2656931.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 8904 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 164.90740740740742, "frac_reward_zero_std": 1.0, "grad_norm": 0.007785635069012642, "kl": 0.00016779005818534642, "learning_rate": 6.088888888888889e-07, "loss": 0.0, "num_tokens": 2657187.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 8905 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 164.92592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.018316807225346565, "kl": 0.0030442768475040793, "learning_rate": 6.083333333333334e-07, "loss": 0.0001, "num_tokens": 2657457.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8906 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.25, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 16.25, "completions/mean_terminated_length": 16.25, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 164.94444444444446, "frac_reward_zero_std": 0.0, "grad_norm": 4.112386703491211, "kl": 0.014781714417040348, "learning_rate": 6.077777777777778e-07, "loss": 0.0359, "num_tokens": 2657782.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 8907 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.75, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 12.75, "completions/mean_terminated_length": 12.75, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 164.96296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.06429889053106308, "kl": 0.023662084713578224, "learning_rate": 6.072222222222223e-07, "loss": 0.0015, "num_tokens": 2658085.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8908 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.25, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.25, "completions/mean_terminated_length": 14.25, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 164.9814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.06547588109970093, "kl": 0.02290950482711196, "learning_rate": 6.066666666666668e-07, "loss": 0.0011, "num_tokens": 2658374.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8909 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 165.0, "frac_reward_zero_std": 0.0, "grad_norm": 3.80747652053833, "kl": 0.05730860633775592, "learning_rate": 6.061111111111111e-07, "loss": 0.0375, "num_tokens": 2658656.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 8910 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.009615384973585606, "clip_ratio/low_min": 0.009615384973585606, "clip_ratio/region_mean": 0.009615384973585606, "completion_length": 22.5, "completions/clipped_ratio": 0.0, "completions/max_length": 31.0, "completions/max_terminated_length": 31.0, "completions/mean_length": 22.5, "completions/mean_terminated_length": 22.5, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 165.0185185185185, "frac_reward_zero_std": 0.0, "grad_norm": 4.419900417327881, "kl": 0.15891794860363007, "learning_rate": 6.055555555555556e-07, "loss": 0.0972, "num_tokens": 2658974.0, "reward": 6.75, "reward_std": 2.5, "rewards/reward_combined/mean": 6.75, "rewards/reward_combined/std": 2.5, "step": 8911 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 165.03703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.05060994252562523, "kl": 0.008509133942425251, "learning_rate": 6.05e-07, "loss": 0.0004, "num_tokens": 2659246.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8912 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.5, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 17.5, "completions/mean_terminated_length": 17.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 165.05555555555554, "frac_reward_zero_std": 1.0, "grad_norm": 0.03984348848462105, "kl": 0.15263860672712326, "learning_rate": 6.044444444444445e-07, "loss": 0.0076, "num_tokens": 2659556.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8913 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 82.0, "completions/clipped_ratio": 0.25, "completions/max_length": 256.0, "completions/max_terminated_length": 36.0, "completions/mean_length": 82.0, "completions/mean_terminated_length": 24.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 165.07407407407408, "frac_reward_zero_std": 0.0, "grad_norm": 2.4487740993499756, "kl": 0.1521749049425125, "learning_rate": 6.03888888888889e-07, "loss": 0.2894, "num_tokens": 2660104.0, "reward": 4.300000190734863, "reward_std": 4.284857273101807, "rewards/reward_combined/mean": 4.300000190734863, "rewards/reward_combined/std": 4.284857273101807, "step": 8914 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 165.09259259259258, "frac_reward_zero_std": 1.0, "grad_norm": 0.060994453728199005, "kl": 0.005441172979772091, "learning_rate": 6.033333333333334e-07, "loss": 0.0003, "num_tokens": 2660404.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8915 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 80.5, "completions/clipped_ratio": 0.25, "completions/max_length": 256.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 80.5, "completions/mean_terminated_length": 22.0, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 165.11111111111111, "frac_reward_zero_std": 0.0, "grad_norm": 1.364551067352295, "kl": 0.014067767653614283, "learning_rate": 6.027777777777778e-07, "loss": 0.4205, "num_tokens": 2660978.0, "reward": 5.050000190734863, "reward_std": 5.900000095367432, "rewards/reward_combined/mean": 5.050000190734863, "rewards/reward_combined/std": 5.90000057220459, "step": 8916 }, { "clip_ratio/high_max": 0.009999999776482582, "clip_ratio/high_mean": 0.009999999776482582, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.009999999776482582, "completion_length": 21.75, "completions/clipped_ratio": 0.0, "completions/max_length": 31.0, "completions/max_terminated_length": 31.0, "completions/mean_length": 21.75, "completions/mean_terminated_length": 21.75, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 165.12962962962962, "frac_reward_zero_std": 0.0, "grad_norm": 4.123881816864014, "kl": 0.44878775626420975, "learning_rate": 6.022222222222223e-07, "loss": 0.1865, "num_tokens": 2661285.0, "reward": 4.75, "reward_std": 3.4034295082092285, "rewards/reward_combined/mean": 4.75, "rewards/reward_combined/std": 3.4034297466278076, "step": 8917 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 10.5, "completions/mean_terminated_length": 10.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 165.14814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.09102547913789749, "kl": 0.01619499863591045, "learning_rate": 6.016666666666667e-07, "loss": 0.0009, "num_tokens": 2661547.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 8918 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 44.5, "completions/clipped_ratio": 0.0, "completions/max_length": 53.0, "completions/max_terminated_length": 53.0, "completions/mean_length": 44.5, "completions/mean_terminated_length": 44.5, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "epoch": 165.16666666666666, "frac_reward_zero_std": 0.0, "grad_norm": 1.717793583869934, "kl": 0.06117686256766319, "learning_rate": 6.011111111111112e-07, "loss": 0.0775, "num_tokens": 2662005.0, "reward": 2.375, "reward_std": 1.25, "rewards/reward_combined/mean": 2.375, "rewards/reward_combined/std": 1.25, "step": 8919 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.25, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 8.25, "completions/mean_terminated_length": 8.25, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 165.1851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.05199921131134033, "kl": 0.019568225368857384, "learning_rate": 6.005555555555556e-07, "loss": 0.001, "num_tokens": 2662250.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 8920 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 165.2037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.008782343938946724, "kl": 0.00017252564430236816, "learning_rate": 6.000000000000001e-07, "loss": 0.0, "num_tokens": 2662506.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 8921 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.5, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 22.5, "completions/mean_terminated_length": 22.5, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 165.22222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.02249167487025261, "kl": 0.007163364673033357, "learning_rate": 5.994444444444445e-07, "loss": 0.0004, "num_tokens": 2662824.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8922 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 72.25, "completions/clipped_ratio": 0.25, "completions/max_length": 256.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 72.25, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 165.24074074074073, "frac_reward_zero_std": 0.0, "grad_norm": 1.6035109758377075, "kl": 0.08344904705882072, "learning_rate": 5.988888888888889e-07, "loss": 0.464, "num_tokens": 2663341.0, "reward": 5.75, "reward_std": 4.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 4.5, "step": 8923 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 165.25925925925927, "frac_reward_zero_std": 1.0, "grad_norm": 0.006998514756560326, "kl": 0.21367629617452621, "learning_rate": 5.983333333333334e-07, "loss": 0.0107, "num_tokens": 2663645.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8924 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.0, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 20.0, "completions/mean_terminated_length": 20.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 165.27777777777777, "frac_reward_zero_std": 0.0, "grad_norm": 4.5704665184021, "kl": 0.12809665803797543, "learning_rate": 5.977777777777778e-07, "loss": -0.0646, "num_tokens": 2663973.0, "reward": 3.125, "reward_std": 1.75, "rewards/reward_combined/mean": 3.125, "rewards/reward_combined/std": 1.75, "step": 8925 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.25, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.25, "completions/mean_terminated_length": 13.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 165.2962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.009629921056330204, "kl": 0.002119035110808909, "learning_rate": 5.972222222222223e-07, "loss": 0.0001, "num_tokens": 2664250.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8926 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 33.25, "completions/clipped_ratio": 0.0, "completions/max_length": 45.0, "completions/max_terminated_length": 45.0, "completions/mean_length": 33.25, "completions/mean_terminated_length": 33.25, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 165.3148148148148, "frac_reward_zero_std": 0.0, "grad_norm": 3.0388152599334717, "kl": 0.12588120996952057, "learning_rate": 5.966666666666667e-07, "loss": 0.0615, "num_tokens": 2664635.0, "reward": 5.625, "reward_std": 4.422951698303223, "rewards/reward_combined/mean": 5.625, "rewards/reward_combined/std": 4.422951698303223, "step": 8927 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 165.33333333333334, "frac_reward_zero_std": 1.0, "grad_norm": 0.042380403727293015, "kl": 0.01573463249951601, "learning_rate": 5.961111111111111e-07, "loss": 0.0008, "num_tokens": 2664923.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8928 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.25, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 4.25, "completions/mean_terminated_length": 4.25, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 165.35185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.017850182950496674, "kl": 0.00031774042872712016, "learning_rate": 5.955555555555556e-07, "loss": 0.0, "num_tokens": 2665136.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 8929 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 165.37037037037038, "frac_reward_zero_std": 1.0, "grad_norm": 0.025608854368329048, "kl": 0.0017061326652765274, "learning_rate": 5.95e-07, "loss": 0.0001, "num_tokens": 2665400.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8930 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 29.5, "completions/clipped_ratio": 0.0, "completions/max_length": 34.0, "completions/max_terminated_length": 34.0, "completions/mean_length": 29.5, "completions/mean_terminated_length": 29.5, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 165.38888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.09308130294084549, "kl": 0.13528945297002792, "learning_rate": 5.944444444444445e-07, "loss": 0.0068, "num_tokens": 2665742.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8931 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.75, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 15.75, "completions/mean_terminated_length": 15.75, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 165.40740740740742, "frac_reward_zero_std": 1.0, "grad_norm": 0.01513142790645361, "kl": 0.03161985706537962, "learning_rate": 5.938888888888889e-07, "loss": 0.0016, "num_tokens": 2666037.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8932 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 16.5, "completions/mean_terminated_length": 16.5, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 165.42592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.18023961782455444, "kl": 0.282998189330101, "learning_rate": 5.933333333333334e-07, "loss": 0.0141, "num_tokens": 2666327.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8933 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.5, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 21.5, "completions/mean_terminated_length": 21.5, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 165.44444444444446, "frac_reward_zero_std": 1.0, "grad_norm": 0.1227315217256546, "kl": 0.0197903651278466, "learning_rate": 5.927777777777778e-07, "loss": 0.001, "num_tokens": 2666657.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8934 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 45.0, "completions/clipped_ratio": 0.0, "completions/max_length": 45.0, "completions/max_terminated_length": 45.0, "completions/mean_length": 45.0, "completions/mean_terminated_length": 45.0, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "epoch": 165.46296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.01171924825757742, "kl": 0.06962549686431885, "learning_rate": 5.922222222222223e-07, "loss": 0.0035, "num_tokens": 2667105.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8935 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 5.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 5.0, "completions/mean_terminated_length": 5.0, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 165.4814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.0048730396665632725, "kl": 0.0008743822399992496, "learning_rate": 5.916666666666667e-07, "loss": 0.0, "num_tokens": 2667325.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 8936 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 165.5, "frac_reward_zero_std": 1.0, "grad_norm": 3.3254073059652e-05, "kl": 4.969537258148193e-06, "learning_rate": 5.911111111111111e-07, "loss": 0.0, "num_tokens": 2667545.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 8937 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 12.5, "completions/mean_terminated_length": 12.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 165.5185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.26692068576812744, "kl": 0.02886806766036898, "learning_rate": 5.905555555555556e-07, "loss": 0.0014, "num_tokens": 2667847.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8938 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 7.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 7.0, "completions/mean_terminated_length": 7.0, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 165.53703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.02074565552175045, "kl": 0.0024566863430663943, "learning_rate": 5.900000000000001e-07, "loss": 0.0001, "num_tokens": 2668083.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 8939 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 165.55555555555554, "frac_reward_zero_std": 1.0, "grad_norm": 0.029683172702789307, "kl": 0.004636108875274658, "learning_rate": 5.894444444444445e-07, "loss": 0.0003, "num_tokens": 2668291.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 8940 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.25, "completions/clipped_ratio": 0.0, "completions/max_length": 29.0, "completions/max_terminated_length": 29.0, "completions/mean_length": 15.25, "completions/mean_terminated_length": 15.25, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 165.57407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.46341171860694885, "kl": 0.05527578294277191, "learning_rate": 5.888888888888889e-07, "loss": 0.0039, "num_tokens": 2668572.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8941 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 79.0, "completions/clipped_ratio": 0.25, "completions/max_length": 256.0, "completions/max_terminated_length": 25.0, "completions/mean_length": 79.0, "completions/mean_terminated_length": 20.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 165.59259259259258, "frac_reward_zero_std": 0.0, "grad_norm": 2.2190704345703125, "kl": 0.07641168311238289, "learning_rate": 5.883333333333333e-07, "loss": 0.4466, "num_tokens": 2669108.0, "reward": 2.375, "reward_std": 3.25, "rewards/reward_combined/mean": 2.375, "rewards/reward_combined/std": 3.25, "step": 8942 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 29.0, "completions/clipped_ratio": 0.0, "completions/max_length": 33.0, "completions/max_terminated_length": 33.0, "completions/mean_length": 29.0, "completions/mean_terminated_length": 29.0, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 165.61111111111111, "frac_reward_zero_std": 0.0, "grad_norm": 2.581693410873413, "kl": 0.0938921757042408, "learning_rate": 5.877777777777778e-07, "loss": 0.0424, "num_tokens": 2669460.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 8943 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 165.62962962962962, "frac_reward_zero_std": 1.0, "grad_norm": 0.0503503642976284, "kl": 0.006272107362747192, "learning_rate": 5.872222222222223e-07, "loss": 0.0003, "num_tokens": 2669744.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8944 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.5, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 16.5, "completions/mean_terminated_length": 16.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 165.64814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.042709700763225555, "kl": 0.013707171194255352, "learning_rate": 5.866666666666667e-07, "loss": 0.0007, "num_tokens": 2670038.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8945 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 10.5, "completions/mean_terminated_length": 10.5, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 165.66666666666666, "frac_reward_zero_std": 1.0, "grad_norm": 0.015440602786839008, "kl": 0.003923032898455858, "learning_rate": 5.861111111111112e-07, "loss": 0.0002, "num_tokens": 2670304.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8946 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.25, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 15.25, "completions/mean_terminated_length": 15.25, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 165.6851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.10415926575660706, "kl": 0.01917477184906602, "learning_rate": 5.855555555555555e-07, "loss": 0.001, "num_tokens": 2670589.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8947 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 165.7037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.0015079461736604571, "kl": 0.0013607144355773926, "learning_rate": 5.850000000000001e-07, "loss": 0.0001, "num_tokens": 2670905.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8948 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 165.72222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.019916335120797157, "kl": 0.01905258186161518, "learning_rate": 5.844444444444445e-07, "loss": 0.001, "num_tokens": 2671173.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8949 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 165.74074074074073, "frac_reward_zero_std": 1.0, "grad_norm": 0.011373116634786129, "kl": 0.007877212949097157, "learning_rate": 5.838888888888889e-07, "loss": 0.0004, "num_tokens": 2671485.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 8950 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 10.5, "completions/mean_terminated_length": 10.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 165.75925925925927, "frac_reward_zero_std": 1.0, "grad_norm": 0.017768116667866707, "kl": 0.001548144209664315, "learning_rate": 5.833333333333334e-07, "loss": 0.0001, "num_tokens": 2671747.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8951 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 165.77777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.013256045058369637, "kl": 0.0032128170132637024, "learning_rate": 5.827777777777778e-07, "loss": 0.0002, "num_tokens": 2672007.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 8952 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 165.7962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.011232771910727024, "kl": 0.008266910910606384, "learning_rate": 5.822222222222223e-07, "loss": 0.0004, "num_tokens": 2672243.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 8953 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.5, "completions/clipped_ratio": 0.0, "completions/max_length": 35.0, "completions/max_terminated_length": 35.0, "completions/mean_length": 22.5, "completions/mean_terminated_length": 22.5, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 165.8148148148148, "frac_reward_zero_std": 0.0, "grad_norm": 2.8214797973632812, "kl": 0.39243993163108826, "learning_rate": 5.816666666666667e-07, "loss": 0.2478, "num_tokens": 2672581.0, "reward": 3.5, "reward_std": 4.690415859222412, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 4.690415859222412, "step": 8954 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 18.0, "completions/mean_terminated_length": 18.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 165.83333333333334, "frac_reward_zero_std": 1.0, "grad_norm": 0.008116938173770905, "kl": 0.0011739582405425608, "learning_rate": 5.811111111111111e-07, "loss": 0.0001, "num_tokens": 2672893.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8955 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 165.85185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.00748162716627121, "kl": 0.0011065155267715454, "learning_rate": 5.805555555555556e-07, "loss": 0.0001, "num_tokens": 2673105.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 8956 }, { "clip_ratio/high_max": 0.006666666828095913, "clip_ratio/high_mean": 0.006666666828095913, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.006666666828095913, "completion_length": 37.75, "completions/clipped_ratio": 0.0, "completions/max_length": 44.0, "completions/max_terminated_length": 44.0, "completions/mean_length": 37.75, "completions/mean_terminated_length": 37.75, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 165.87037037037038, "frac_reward_zero_std": 0.0, "grad_norm": 2.5612223148345947, "kl": 0.09126797318458557, "learning_rate": 5.800000000000001e-07, "loss": 0.04, "num_tokens": 2673480.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 8957 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 165.88888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.021906593814492226, "kl": 0.026784121990203857, "learning_rate": 5.794444444444445e-07, "loss": 0.0013, "num_tokens": 2673696.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 8958 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 165.90740740740742, "frac_reward_zero_std": 1.0, "grad_norm": 0.01854870654642582, "kl": 0.014304588548839092, "learning_rate": 5.788888888888889e-07, "loss": 0.0007, "num_tokens": 2673956.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8959 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 38.25, "completions/clipped_ratio": 0.0, "completions/max_length": 51.0, "completions/max_terminated_length": 51.0, "completions/mean_length": 38.25, "completions/mean_terminated_length": 38.25, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 165.92592592592592, "frac_reward_zero_std": 0.0, "grad_norm": 2.2771315574645996, "kl": 0.23418357595801353, "learning_rate": 5.783333333333333e-07, "loss": -0.061, "num_tokens": 2674337.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 8960 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 26.0, "completions/clipped_ratio": 0.0, "completions/max_length": 34.0, "completions/max_terminated_length": 34.0, "completions/mean_length": 26.0, "completions/mean_terminated_length": 26.0, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 165.94444444444446, "frac_reward_zero_std": 0.0, "grad_norm": 3.3455238342285156, "kl": 0.1914447396993637, "learning_rate": 5.777777777777778e-07, "loss": 0.1537, "num_tokens": 2674657.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 8961 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 45.75, "completions/clipped_ratio": 0.0, "completions/max_length": 58.0, "completions/max_terminated_length": 58.0, "completions/mean_length": 45.75, "completions/mean_terminated_length": 45.75, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 165.96296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.11260408908128738, "kl": 0.04851921275258064, "learning_rate": 5.772222222222223e-07, "loss": 0.0019, "num_tokens": 2675060.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8962 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 165.9814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.02604684978723526, "kl": 0.0048858115915209055, "learning_rate": 5.766666666666667e-07, "loss": 0.0002, "num_tokens": 2675350.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8963 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.25, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.25, "completions/mean_terminated_length": 14.25, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 166.0, "frac_reward_zero_std": 0.0, "grad_norm": 3.5303800106048584, "kl": 0.4525347054004669, "learning_rate": 5.761111111111112e-07, "loss": -0.0324, "num_tokens": 2675631.0, "reward": 7.875, "reward_std": 0.25, "rewards/reward_combined/mean": 7.875, "rewards/reward_combined/std": 0.25, "step": 8964 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.25, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.25, "completions/mean_terminated_length": 14.25, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 166.0185185185185, "frac_reward_zero_std": 0.0, "grad_norm": 4.357170104980469, "kl": 0.4773280620574951, "learning_rate": 5.755555555555555e-07, "loss": -0.0253, "num_tokens": 2675912.0, "reward": 7.875, "reward_std": 0.25, "rewards/reward_combined/mean": 7.875, "rewards/reward_combined/std": 0.25, "step": 8965 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.25, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.25, "completions/mean_terminated_length": 14.25, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 166.03703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.07415597140789032, "kl": 0.017702126875519753, "learning_rate": 5.750000000000001e-07, "loss": 0.0009, "num_tokens": 2676201.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8966 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 33.0, "completions/clipped_ratio": 0.0, "completions/max_length": 48.0, "completions/max_terminated_length": 48.0, "completions/mean_length": 33.0, "completions/mean_terminated_length": 33.0, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 166.05555555555554, "frac_reward_zero_std": 0.0, "grad_norm": 2.1052088737487793, "kl": 0.23821000009775162, "learning_rate": 5.744444444444445e-07, "loss": 0.0893, "num_tokens": 2676569.0, "reward": 4.125, "reward_std": 3.902456521987915, "rewards/reward_combined/mean": 4.125, "rewards/reward_combined/std": 3.902456521987915, "step": 8967 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 166.07407407407408, "frac_reward_zero_std": 0.0, "grad_norm": 2.748945474624634, "kl": 0.11215350264683366, "learning_rate": 5.738888888888889e-07, "loss": -0.0346, "num_tokens": 2676847.0, "reward": 6.75, "reward_std": 2.5, "rewards/reward_combined/mean": 6.75, "rewards/reward_combined/std": 2.5, "step": 8968 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 166.09259259259258, "frac_reward_zero_std": 1.0, "grad_norm": 0.015744009986519814, "kl": 0.0004684925079345703, "learning_rate": 5.733333333333334e-07, "loss": 0.0, "num_tokens": 2677059.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 8969 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 166.11111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.03199375793337822, "kl": 0.0007267326218425296, "learning_rate": 5.727777777777778e-07, "loss": 0.0, "num_tokens": 2677315.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 8970 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.0, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 21.0, "completions/mean_terminated_length": 21.0, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 166.12962962962962, "frac_reward_zero_std": 1.0, "grad_norm": 0.02806198224425316, "kl": 0.008676875848323107, "learning_rate": 5.722222222222223e-07, "loss": 0.0004, "num_tokens": 2677643.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8971 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.5, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 16.5, "completions/mean_terminated_length": 16.5, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 166.14814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.035097233951091766, "kl": 0.004186241363640875, "learning_rate": 5.716666666666667e-07, "loss": 0.0002, "num_tokens": 2677969.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8972 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.25, "completions/clipped_ratio": 0.0, "completions/max_length": 32.0, "completions/max_terminated_length": 32.0, "completions/mean_length": 20.25, "completions/mean_terminated_length": 20.25, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 166.16666666666666, "frac_reward_zero_std": 0.0, "grad_norm": 6.098155975341797, "kl": 0.10714324563741684, "learning_rate": 5.711111111111111e-07, "loss": 0.2185, "num_tokens": 2678270.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 8973 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 12.5, "completions/mean_terminated_length": 12.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 166.1851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.012322569265961647, "kl": 0.0029155617812648416, "learning_rate": 5.705555555555556e-07, "loss": 0.0001, "num_tokens": 2678552.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8974 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.0, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 19.0, "completions/mean_terminated_length": 19.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 166.2037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.05549256503582001, "kl": 0.007678136578761041, "learning_rate": 5.7e-07, "loss": 0.0004, "num_tokens": 2678868.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8975 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.75, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 4.75, "completions/mean_terminated_length": 4.75, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 166.22222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.03614320978522301, "kl": 0.027033647522330284, "learning_rate": 5.694444444444445e-07, "loss": 0.0014, "num_tokens": 2679087.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 8976 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 166.24074074074073, "frac_reward_zero_std": 1.0, "grad_norm": 0.026006443426012993, "kl": 0.002435557544231415, "learning_rate": 5.68888888888889e-07, "loss": 0.0001, "num_tokens": 2679295.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 8977 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 166.25925925925927, "frac_reward_zero_std": 1.0, "grad_norm": 0.05474280193448067, "kl": 0.1740742027759552, "learning_rate": 5.683333333333333e-07, "loss": 0.0087, "num_tokens": 2679579.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8978 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.75, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 23.75, "completions/mean_terminated_length": 23.75, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 166.27777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.14236114919185638, "kl": 0.028936564922332764, "learning_rate": 5.677777777777779e-07, "loss": 0.0014, "num_tokens": 2679922.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 8979 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.25, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.25, "completions/mean_terminated_length": 11.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 166.2962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.056351128965616226, "kl": 0.012129681184887886, "learning_rate": 5.672222222222223e-07, "loss": 0.0006, "num_tokens": 2680195.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8980 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 38.25, "completions/clipped_ratio": 0.0, "completions/max_length": 45.0, "completions/max_terminated_length": 45.0, "completions/mean_length": 38.25, "completions/mean_terminated_length": 38.25, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 166.3148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.02222263067960739, "kl": 0.08597204089164734, "learning_rate": 5.666666666666667e-07, "loss": 0.0041, "num_tokens": 2680616.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8981 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 42.25, "completions/clipped_ratio": 0.0, "completions/max_length": 79.0, "completions/max_terminated_length": 79.0, "completions/mean_length": 42.25, "completions/mean_terminated_length": 42.25, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 166.33333333333334, "frac_reward_zero_std": 0.0, "grad_norm": 2.245720386505127, "kl": 0.12205914035439491, "learning_rate": 5.661111111111112e-07, "loss": 0.2112, "num_tokens": 2681009.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 8982 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 35.0, "completions/clipped_ratio": 0.0, "completions/max_length": 42.0, "completions/max_terminated_length": 42.0, "completions/mean_length": 35.0, "completions/mean_terminated_length": 35.0, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 166.35185185185185, "frac_reward_zero_std": 0.0, "grad_norm": 2.32720947265625, "kl": 0.07597795128822327, "learning_rate": 5.655555555555555e-07, "loss": -0.0131, "num_tokens": 2681401.0, "reward": 6.125, "reward_std": 3.4247870445251465, "rewards/reward_combined/mean": 6.125, "rewards/reward_combined/std": 3.4247870445251465, "step": 8983 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.5, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 20.5, "completions/mean_terminated_length": 20.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 166.37037037037038, "frac_reward_zero_std": 1.0, "grad_norm": 0.34734806418418884, "kl": 0.09969808161258698, "learning_rate": 5.650000000000001e-07, "loss": 0.0046, "num_tokens": 2681703.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 8984 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 28.25, "completions/clipped_ratio": 0.0, "completions/max_length": 29.0, "completions/max_terminated_length": 29.0, "completions/mean_length": 28.25, "completions/mean_terminated_length": 28.25, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 166.38888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.33089205622673035, "kl": 0.16922207921743393, "learning_rate": 5.644444444444445e-07, "loss": 0.0085, "num_tokens": 2682032.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8985 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 166.40740740740742, "frac_reward_zero_std": 1.0, "grad_norm": 0.032168976962566376, "kl": 0.0036463551223278046, "learning_rate": 5.638888888888889e-07, "loss": 0.0002, "num_tokens": 2682276.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 8986 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 166.42592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.040652524679899216, "kl": 0.003614056156948209, "learning_rate": 5.633333333333334e-07, "loss": 0.0002, "num_tokens": 2682540.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8987 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.5, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 18.5, "completions/mean_terminated_length": 18.5, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 166.44444444444446, "frac_reward_zero_std": 0.0, "grad_norm": 1.7885606288909912, "kl": 0.024470129515975714, "learning_rate": 5.627777777777778e-07, "loss": 0.0256, "num_tokens": 2682842.0, "reward": 7.25, "reward_std": 1.5, "rewards/reward_combined/mean": 7.25, "rewards/reward_combined/std": 1.5, "step": 8988 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.5, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 17.5, "completions/mean_terminated_length": 17.5, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 166.46296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.14904683828353882, "kl": 0.023246537428349257, "learning_rate": 5.622222222222223e-07, "loss": 0.0012, "num_tokens": 2683132.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8989 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.75, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 13.75, "completions/mean_terminated_length": 13.75, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 166.4814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.015519066713750362, "kl": 0.006774484179913998, "learning_rate": 5.616666666666667e-07, "loss": 0.0003, "num_tokens": 2683419.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8990 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 166.5, "frac_reward_zero_std": 1.0, "grad_norm": 0.16619208455085754, "kl": 0.030470986384898424, "learning_rate": 5.611111111111111e-07, "loss": 0.0016, "num_tokens": 2683709.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8991 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.5, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 8.5, "completions/mean_terminated_length": 8.5, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 166.5185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.01965447887778282, "kl": 0.0016895026492420584, "learning_rate": 5.605555555555556e-07, "loss": 0.0001, "num_tokens": 2683963.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 8992 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 36.25, "completions/clipped_ratio": 0.0, "completions/max_length": 39.0, "completions/max_terminated_length": 39.0, "completions/mean_length": 36.25, "completions/mean_terminated_length": 36.25, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 166.53703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.03419000282883644, "kl": 0.0733717754483223, "learning_rate": 5.6e-07, "loss": 0.0036, "num_tokens": 2684336.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8993 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.5, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 22.5, "completions/mean_terminated_length": 22.5, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 166.55555555555554, "frac_reward_zero_std": 1.0, "grad_norm": 0.02841791883111, "kl": 0.0071466523222625256, "learning_rate": 5.594444444444445e-07, "loss": 0.0004, "num_tokens": 2684654.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8994 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 166.57407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.01112205721437931, "kl": 0.008357919752597809, "learning_rate": 5.58888888888889e-07, "loss": 0.0004, "num_tokens": 2684890.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 8995 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 37.25, "completions/clipped_ratio": 0.0, "completions/max_length": 40.0, "completions/max_terminated_length": 40.0, "completions/mean_length": 37.25, "completions/mean_terminated_length": 37.25, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 166.59259259259258, "frac_reward_zero_std": 1.0, "grad_norm": 0.04369428753852844, "kl": 0.03450854029506445, "learning_rate": 5.583333333333333e-07, "loss": 0.0018, "num_tokens": 2685263.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8996 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.25, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 12.25, "completions/mean_terminated_length": 12.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 166.61111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.027009909972548485, "kl": 0.0014171039802022278, "learning_rate": 5.577777777777779e-07, "loss": 0.0001, "num_tokens": 2685524.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 8997 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.25, "completions/clipped_ratio": 0.0, "completions/max_length": 25.0, "completions/max_terminated_length": 25.0, "completions/mean_length": 21.25, "completions/mean_terminated_length": 21.25, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 166.62962962962962, "frac_reward_zero_std": 1.0, "grad_norm": 0.32567647099494934, "kl": 0.08528128452599049, "learning_rate": 5.572222222222222e-07, "loss": 0.004, "num_tokens": 2685857.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 8998 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.5, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 16.5, "completions/mean_terminated_length": 16.5, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 166.64814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.20181483030319214, "kl": 0.023690885864198208, "learning_rate": 5.566666666666667e-07, "loss": 0.0012, "num_tokens": 2686171.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 8999 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.25, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 16.25, "completions/mean_terminated_length": 16.25, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 166.66666666666666, "frac_reward_zero_std": 1.0, "grad_norm": 0.0884907990694046, "kl": 0.2818532884120941, "learning_rate": 5.561111111111112e-07, "loss": 0.0141, "num_tokens": 2686460.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9000 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 166.6851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.11505831778049469, "kl": 0.021528981626033783, "learning_rate": 5.555555555555555e-07, "loss": 0.0011, "num_tokens": 2686766.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9001 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.75, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 16.75, "completions/mean_terminated_length": 16.75, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 166.7037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.34645333886146545, "kl": 0.2539060637354851, "learning_rate": 5.550000000000001e-07, "loss": 0.0128, "num_tokens": 2687073.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9002 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 166.72222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.0003821342543233186, "kl": 2.8304755687713623e-05, "learning_rate": 5.544444444444444e-07, "loss": 0.0, "num_tokens": 2687293.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 9003 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.5, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 17.5, "completions/mean_terminated_length": 17.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 166.74074074074073, "frac_reward_zero_std": 1.0, "grad_norm": 0.028027160093188286, "kl": 0.171719491481781, "learning_rate": 5.538888888888889e-07, "loss": 0.0086, "num_tokens": 2687603.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9004 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 10.5, "completions/mean_terminated_length": 10.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 166.75925925925927, "frac_reward_zero_std": 1.0, "grad_norm": 0.017366481944918633, "kl": 0.0011959478142671287, "learning_rate": 5.533333333333334e-07, "loss": 0.0001, "num_tokens": 2687865.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9005 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 51.0, "completions/clipped_ratio": 0.0, "completions/max_length": 75.0, "completions/max_terminated_length": 75.0, "completions/mean_length": 51.0, "completions/mean_terminated_length": 51.0, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 166.77777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.11167077720165253, "kl": 0.05421202816069126, "learning_rate": 5.527777777777778e-07, "loss": 0.0028, "num_tokens": 2688349.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 9006 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.5, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.5, "completions/mean_terminated_length": 6.5, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 166.7962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.029229743406176567, "kl": 0.0015880082501098514, "learning_rate": 5.522222222222223e-07, "loss": 0.0001, "num_tokens": 2688583.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 9007 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 166.8148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.0009379505063407123, "kl": 0.0016856364673003554, "learning_rate": 5.516666666666667e-07, "loss": 0.0001, "num_tokens": 2688863.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9008 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.5, "completions/clipped_ratio": 0.0, "completions/max_length": 27.0, "completions/max_terminated_length": 27.0, "completions/mean_length": 23.5, "completions/mean_terminated_length": 23.5, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 166.83333333333334, "frac_reward_zero_std": 1.0, "grad_norm": 0.09602802246809006, "kl": 0.09234890341758728, "learning_rate": 5.511111111111111e-07, "loss": 0.0046, "num_tokens": 2689185.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9009 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 166.85185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.01485492940992117, "kl": 0.003499099286273122, "learning_rate": 5.505555555555557e-07, "loss": 0.0002, "num_tokens": 2689487.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9010 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.5, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 9.5, "completions/mean_terminated_length": 9.5, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 166.87037037037038, "frac_reward_zero_std": 1.0, "grad_norm": 0.11996573954820633, "kl": 0.01596884010359645, "learning_rate": 5.5e-07, "loss": 0.0008, "num_tokens": 2689749.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9011 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 38.5, "completions/clipped_ratio": 0.0, "completions/max_length": 48.0, "completions/max_terminated_length": 48.0, "completions/mean_length": 38.5, "completions/mean_terminated_length": 38.5, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 166.88888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.08970341831445694, "kl": 0.019633106887340546, "learning_rate": 5.494444444444445e-07, "loss": 0.001, "num_tokens": 2690123.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9012 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 166.90740740740742, "frac_reward_zero_std": 1.0, "grad_norm": 0.0043350569903850555, "kl": 0.0030260570347309113, "learning_rate": 5.48888888888889e-07, "loss": 0.0002, "num_tokens": 2690383.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 9013 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 5.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 5.0, "completions/mean_terminated_length": 5.0, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 166.92592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.00494323531165719, "kl": 0.0008447170257568359, "learning_rate": 5.483333333333333e-07, "loss": 0.0, "num_tokens": 2690603.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 9014 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 5.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 5.5, "completions/mean_terminated_length": 5.5, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 166.94444444444446, "frac_reward_zero_std": 0.0, "grad_norm": 8.714527130126953, "kl": 0.006014121696352959, "learning_rate": 5.477777777777779e-07, "loss": 0.3826, "num_tokens": 2690829.0, "reward": 2.75, "reward_std": 1.5, "rewards/reward_combined/mean": 2.75, "rewards/reward_combined/std": 1.5, "step": 9015 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.75, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 15.75, "completions/mean_terminated_length": 15.75, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 166.96296296296296, "frac_reward_zero_std": 0.0, "grad_norm": 3.3544039726257324, "kl": 0.1337018571794033, "learning_rate": 5.472222222222222e-07, "loss": 0.0593, "num_tokens": 2691108.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 9016 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 166.9814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.03234381228685379, "kl": 0.016726797446608543, "learning_rate": 5.466666666666667e-07, "loss": 0.0008, "num_tokens": 2691376.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9017 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.0, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 21.0, "completions/mean_terminated_length": 21.0, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 167.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.020904259756207466, "kl": 0.008693670388311148, "learning_rate": 5.461111111111112e-07, "loss": 0.0004, "num_tokens": 2691712.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9018 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 167.0185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.02662588655948639, "kl": 0.005055274581536651, "learning_rate": 5.455555555555556e-07, "loss": 0.0003, "num_tokens": 2692002.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9019 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 34.25, "completions/clipped_ratio": 0.0, "completions/max_length": 37.0, "completions/max_terminated_length": 37.0, "completions/mean_length": 34.25, "completions/mean_terminated_length": 34.25, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 167.03703703703704, "frac_reward_zero_std": 0.0, "grad_norm": 1.8700768947601318, "kl": 0.2349288985133171, "learning_rate": 5.450000000000001e-07, "loss": -0.0346, "num_tokens": 2692367.0, "reward": 7.875, "reward_std": 0.25, "rewards/reward_combined/mean": 7.875, "rewards/reward_combined/std": 0.25, "step": 9020 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.75, "completions/clipped_ratio": 0.0, "completions/max_length": 34.0, "completions/max_terminated_length": 34.0, "completions/mean_length": 22.75, "completions/mean_terminated_length": 22.75, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 167.05555555555554, "frac_reward_zero_std": 0.0, "grad_norm": 4.096656799316406, "kl": 0.08936465531587601, "learning_rate": 5.444444444444444e-07, "loss": 0.1431, "num_tokens": 2692686.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 9021 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 167.07407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.005470545031130314, "kl": 0.2262200564146042, "learning_rate": 5.438888888888889e-07, "loss": 0.0113, "num_tokens": 2692988.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9022 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.75, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.75, "completions/mean_terminated_length": 6.75, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 167.09259259259258, "frac_reward_zero_std": 1.0, "grad_norm": 0.01983650214970112, "kl": 0.001667472650296986, "learning_rate": 5.433333333333334e-07, "loss": 0.0001, "num_tokens": 2693223.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 9023 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 167.11111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.013052682392299175, "kl": 0.002096753567457199, "learning_rate": 5.427777777777778e-07, "loss": 0.0001, "num_tokens": 2693475.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9024 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.0, "completions/clipped_ratio": 0.0, "completions/max_length": 45.0, "completions/max_terminated_length": 45.0, "completions/mean_length": 25.0, "completions/mean_terminated_length": 25.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 167.12962962962962, "frac_reward_zero_std": 0.0, "grad_norm": 3.395554780960083, "kl": 0.07587513700127602, "learning_rate": 5.422222222222223e-07, "loss": 0.0019, "num_tokens": 2693795.0, "reward": 2.5, "reward_std": 1.7320507764816284, "rewards/reward_combined/mean": 2.5, "rewards/reward_combined/std": 1.7320507764816284, "step": 9025 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 167.14814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.07347986847162247, "kl": 0.00531852530548349, "learning_rate": 5.416666666666667e-07, "loss": 0.0003, "num_tokens": 2694093.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9026 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 30.25, "completions/clipped_ratio": 0.0, "completions/max_length": 34.0, "completions/max_terminated_length": 34.0, "completions/mean_length": 30.25, "completions/mean_terminated_length": 30.25, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 167.16666666666666, "frac_reward_zero_std": 1.0, "grad_norm": 0.9912640452384949, "kl": 0.2921149283647537, "learning_rate": 5.411111111111111e-07, "loss": 0.0149, "num_tokens": 2694430.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9027 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 167.1851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.0021902553271502256, "kl": 0.2822519540786743, "learning_rate": 5.405555555555557e-07, "loss": 0.0141, "num_tokens": 2694718.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9028 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 18.0, "completions/mean_terminated_length": 18.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 167.2037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.02019772119820118, "kl": 0.0049447359051555395, "learning_rate": 5.4e-07, "loss": 0.0002, "num_tokens": 2695018.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9029 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 167.22222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.049099359661340714, "kl": 0.012427820824086666, "learning_rate": 5.394444444444445e-07, "loss": 0.0006, "num_tokens": 2695290.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9030 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 10.5, "completions/mean_terminated_length": 10.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 167.24074074074073, "frac_reward_zero_std": 1.0, "grad_norm": 0.0898057296872139, "kl": 0.014718440535943955, "learning_rate": 5.388888888888889e-07, "loss": 0.0008, "num_tokens": 2695552.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 9031 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.75, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 20.75, "completions/mean_terminated_length": 20.75, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 167.25925925925927, "frac_reward_zero_std": 1.0, "grad_norm": 0.06687069684267044, "kl": 0.009678017115220428, "learning_rate": 5.383333333333333e-07, "loss": 0.0005, "num_tokens": 2695879.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9032 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.5, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 21.5, "completions/mean_terminated_length": 21.5, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 167.27777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.03621787205338478, "kl": 0.012050039134919643, "learning_rate": 5.377777777777779e-07, "loss": 0.0006, "num_tokens": 2696217.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9033 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.25, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 16.25, "completions/mean_terminated_length": 16.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 167.2962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.2296241968870163, "kl": 0.04520384594798088, "learning_rate": 5.372222222222222e-07, "loss": 0.0024, "num_tokens": 2696506.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9034 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.5, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 19.5, "completions/mean_terminated_length": 19.5, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 167.3148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.16745541989803314, "kl": 0.031291038263589144, "learning_rate": 5.366666666666667e-07, "loss": 0.0015, "num_tokens": 2696804.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9035 }, { "clip_ratio/high_max": 0.0052083334885537624, "clip_ratio/high_mean": 0.0052083334885537624, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0052083334885537624, "completion_length": 47.5, "completions/clipped_ratio": 0.0, "completions/max_length": 61.0, "completions/max_terminated_length": 61.0, "completions/mean_length": 47.5, "completions/mean_terminated_length": 47.5, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 167.33333333333334, "frac_reward_zero_std": 0.0, "grad_norm": 2.4670498371124268, "kl": 0.18283680081367493, "learning_rate": 5.361111111111111e-07, "loss": 0.1446, "num_tokens": 2697274.0, "reward": 2.375, "reward_std": 1.25, "rewards/reward_combined/mean": 2.375, "rewards/reward_combined/std": 1.25, "step": 9036 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 36.5, "completions/clipped_ratio": 0.0, "completions/max_length": 49.0, "completions/max_terminated_length": 49.0, "completions/mean_length": 36.5, "completions/mean_terminated_length": 36.5, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 167.35185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.1110760048031807, "kl": 0.0247899005189538, "learning_rate": 5.355555555555556e-07, "loss": 0.0014, "num_tokens": 2697644.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9037 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 31.5, "completions/clipped_ratio": 0.0, "completions/max_length": 38.0, "completions/max_terminated_length": 38.0, "completions/mean_length": 31.5, "completions/mean_terminated_length": 31.5, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 167.37037037037038, "frac_reward_zero_std": 0.0, "grad_norm": 1.2674356698989868, "kl": 0.07819021493196487, "learning_rate": 5.350000000000001e-07, "loss": -0.1368, "num_tokens": 2698022.0, "reward": 7.875, "reward_std": 0.25, "rewards/reward_combined/mean": 7.875, "rewards/reward_combined/std": 0.25, "step": 9038 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 76.0, "completions/clipped_ratio": 0.25, "completions/max_length": 256.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 76.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 167.38888888888889, "frac_reward_zero_std": 0.0, "grad_norm": 2.044985055923462, "kl": 0.07993892673403025, "learning_rate": 5.344444444444445e-07, "loss": 0.4554, "num_tokens": 2698546.0, "reward": 6.625, "reward_std": 1.75, "rewards/reward_combined/mean": 6.625, "rewards/reward_combined/std": 1.75, "step": 9039 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.75, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 17.75, "completions/mean_terminated_length": 17.75, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 167.40740740740742, "frac_reward_zero_std": 1.0, "grad_norm": 0.04638337716460228, "kl": 0.17208465188741684, "learning_rate": 5.338888888888889e-07, "loss": 0.0086, "num_tokens": 2698857.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9040 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 167.42592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.007203652057796717, "kl": 0.001454070908948779, "learning_rate": 5.333333333333335e-07, "loss": 0.0001, "num_tokens": 2699179.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9041 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 36.0, "completions/clipped_ratio": 0.0, "completions/max_length": 45.0, "completions/max_terminated_length": 45.0, "completions/mean_length": 36.0, "completions/mean_terminated_length": 36.0, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 167.44444444444446, "frac_reward_zero_std": 0.0, "grad_norm": 1.7497165203094482, "kl": 0.07697578892111778, "learning_rate": 5.327777777777778e-07, "loss": -0.0206, "num_tokens": 2699591.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 9042 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.0, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 17.0, "completions/mean_terminated_length": 17.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 167.46296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.18207426369190216, "kl": 0.1712627112865448, "learning_rate": 5.322222222222223e-07, "loss": 0.0086, "num_tokens": 2699879.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9043 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.75, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 9.75, "completions/mean_terminated_length": 9.75, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 167.4814814814815, "frac_reward_zero_std": 0.0, "grad_norm": 5.920523166656494, "kl": 0.5845605134963989, "learning_rate": 5.316666666666667e-07, "loss": 0.1197, "num_tokens": 2700146.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 9044 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 167.5, "frac_reward_zero_std": 1.0, "grad_norm": 0.006420596968382597, "kl": 0.0002308219627593644, "learning_rate": 5.311111111111111e-07, "loss": 0.0, "num_tokens": 2700402.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 9045 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.5, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 19.5, "completions/mean_terminated_length": 19.5, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 167.5185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.04503779113292694, "kl": 0.10447168024256825, "learning_rate": 5.305555555555557e-07, "loss": 0.0046, "num_tokens": 2700728.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9046 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 5.5, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 5.5, "completions/mean_terminated_length": 5.5, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 167.53703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.044514089822769165, "kl": 0.026829050853848457, "learning_rate": 5.3e-07, "loss": 0.0013, "num_tokens": 2700950.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 9047 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.003448275849223137, "clip_ratio/low_min": 0.003448275849223137, "clip_ratio/region_mean": 0.003448275849223137, "completion_length": 87.0, "completions/clipped_ratio": 0.25, "completions/max_length": 256.0, "completions/max_terminated_length": 34.0, "completions/mean_length": 87.0, "completions/mean_terminated_length": 30.666667938232422, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 167.55555555555554, "frac_reward_zero_std": 0.0, "grad_norm": 1.9638694524765015, "kl": 0.13031713664531708, "learning_rate": 5.294444444444445e-07, "loss": 0.4092, "num_tokens": 2701522.0, "reward": 4.625, "reward_std": 4.308422088623047, "rewards/reward_combined/mean": 4.625, "rewards/reward_combined/std": 4.308422088623047, "step": 9048 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 167.57407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.024857839569449425, "kl": 0.4453265219926834, "learning_rate": 5.288888888888889e-07, "loss": 0.0223, "num_tokens": 2701806.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9049 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 167.59259259259258, "frac_reward_zero_std": 0.0, "grad_norm": 3.2464041709899902, "kl": 0.4952568393200636, "learning_rate": 5.283333333333334e-07, "loss": 0.1823, "num_tokens": 2702088.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 9050 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.5, "completions/clipped_ratio": 0.0, "completions/max_length": 40.0, "completions/max_terminated_length": 40.0, "completions/mean_length": 32.5, "completions/mean_terminated_length": 32.5, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 167.61111111111111, "frac_reward_zero_std": 0.0, "grad_norm": 2.853250503540039, "kl": 0.1405719295144081, "learning_rate": 5.277777777777779e-07, "loss": -0.0659, "num_tokens": 2702454.0, "reward": 7.625, "reward_std": 0.25, "rewards/reward_combined/mean": 7.625, "rewards/reward_combined/std": 0.25, "step": 9051 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 167.62962962962962, "frac_reward_zero_std": 1.0, "grad_norm": 0.0029734617564827204, "kl": 0.005475640296936035, "learning_rate": 5.272222222222222e-07, "loss": 0.0003, "num_tokens": 2702766.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 9052 }, { "clip_ratio/high_max": 0.008620689623057842, "clip_ratio/high_mean": 0.008620689623057842, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.008620689623057842, "completion_length": 20.5, "completions/clipped_ratio": 0.0, "completions/max_length": 46.0, "completions/max_terminated_length": 46.0, "completions/mean_length": 20.5, "completions/mean_terminated_length": 20.5, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 167.64814814814815, "frac_reward_zero_std": 0.0, "grad_norm": 5.046206951141357, "kl": 0.06838009256171063, "learning_rate": 5.266666666666667e-07, "loss": 0.2929, "num_tokens": 2703060.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 9053 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 167.66666666666666, "frac_reward_zero_std": 1.0, "grad_norm": 0.01718645915389061, "kl": 0.003159894375130534, "learning_rate": 5.261111111111111e-07, "loss": 0.0002, "num_tokens": 2703344.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9054 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 167.6851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.0076040648855268955, "kl": 0.0014261752367019653, "learning_rate": 5.255555555555556e-07, "loss": 0.0001, "num_tokens": 2703556.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 9055 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.0, "completions/clipped_ratio": 0.0, "completions/max_length": 30.0, "completions/max_terminated_length": 30.0, "completions/mean_length": 24.0, "completions/mean_terminated_length": 24.0, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 167.7037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.11980173736810684, "kl": 0.026854409836232662, "learning_rate": 5.250000000000001e-07, "loss": 0.0014, "num_tokens": 2703880.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9056 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.75, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 13.75, "completions/mean_terminated_length": 13.75, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 167.72222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.07718062400817871, "kl": 0.011701012495905161, "learning_rate": 5.244444444444445e-07, "loss": 0.0006, "num_tokens": 2704151.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9057 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 167.74074074074073, "frac_reward_zero_std": 1.0, "grad_norm": 0.0007284170133061707, "kl": 0.0016851446707732975, "learning_rate": 5.238888888888889e-07, "loss": 0.0001, "num_tokens": 2704431.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9058 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 167.75925925925927, "frac_reward_zero_std": 1.0, "grad_norm": 3.2583047868683934e-05, "kl": 4.6193599700927734e-06, "learning_rate": 5.233333333333334e-07, "loss": 0.0, "num_tokens": 2704651.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 9059 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 167.77777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.01990668475627899, "kl": 0.003188241505995393, "learning_rate": 5.227777777777778e-07, "loss": 0.0002, "num_tokens": 2704953.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9060 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.25, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 17.25, "completions/mean_terminated_length": 17.25, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 167.7962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.16267764568328857, "kl": 0.027385008172132075, "learning_rate": 5.222222222222223e-07, "loss": 0.0013, "num_tokens": 2705262.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9061 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 7.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 7.0, "completions/mean_terminated_length": 7.0, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 167.8148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.08095590770244598, "kl": 0.011527193535584956, "learning_rate": 5.216666666666667e-07, "loss": 0.0008, "num_tokens": 2705490.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 9062 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 167.83333333333334, "frac_reward_zero_std": 1.0, "grad_norm": 0.0095047103241086, "kl": 0.031865211203694344, "learning_rate": 5.211111111111111e-07, "loss": 0.0016, "num_tokens": 2705782.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9063 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 167.85185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.026907894760370255, "kl": 0.03014919871930033, "learning_rate": 5.205555555555556e-07, "loss": 0.0015, "num_tokens": 2706074.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9064 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 167.87037037037038, "frac_reward_zero_std": 1.0, "grad_norm": 0.025311075150966644, "kl": 0.002456091344356537, "learning_rate": 5.2e-07, "loss": 0.0001, "num_tokens": 2706282.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 9065 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 31.25, "completions/clipped_ratio": 0.0, "completions/max_length": 48.0, "completions/max_terminated_length": 48.0, "completions/mean_length": 31.25, "completions/mean_terminated_length": 31.25, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 167.88888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.02649300917983055, "kl": 0.01994998101145029, "learning_rate": 5.194444444444445e-07, "loss": 0.001, "num_tokens": 2706627.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9066 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 10.5, "completions/mean_terminated_length": 10.5, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 167.90740740740742, "frac_reward_zero_std": 1.0, "grad_norm": 0.023517826572060585, "kl": 0.004043449996970594, "learning_rate": 5.188888888888889e-07, "loss": 0.0002, "num_tokens": 2706893.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9067 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 167.92592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.02421557903289795, "kl": 0.0005806609988212585, "learning_rate": 5.183333333333334e-07, "loss": 0.0, "num_tokens": 2707105.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 9068 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.5, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 4.5, "completions/mean_terminated_length": 4.5, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 167.94444444444446, "frac_reward_zero_std": 1.0, "grad_norm": 0.3903452754020691, "kl": 0.09564333409070969, "learning_rate": 5.177777777777778e-07, "loss": 0.0057, "num_tokens": 2707343.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 9069 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.0, "completions/clipped_ratio": 0.0, "completions/max_length": 29.0, "completions/max_terminated_length": 29.0, "completions/mean_length": 25.0, "completions/mean_terminated_length": 25.0, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 167.96296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.14180471003055573, "kl": 0.02460425952449441, "learning_rate": 5.172222222222223e-07, "loss": 0.0012, "num_tokens": 2707691.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 9070 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 7.75, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 7.75, "completions/mean_terminated_length": 7.75, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 167.9814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.06803836673498154, "kl": 0.004114177543669939, "learning_rate": 5.166666666666667e-07, "loss": 0.0002, "num_tokens": 2707934.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 9071 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 168.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.023234793916344643, "kl": 0.01361490273848176, "learning_rate": 5.161111111111112e-07, "loss": 0.0007, "num_tokens": 2708194.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9072 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.0, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 21.0, "completions/mean_terminated_length": 21.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 168.0185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.1580348163843155, "kl": 0.056946590542793274, "learning_rate": 5.155555555555556e-07, "loss": 0.0027, "num_tokens": 2708498.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 9073 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 168.03703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.006335123907774687, "kl": 0.0005557164549827576, "learning_rate": 5.15e-07, "loss": 0.0, "num_tokens": 2708742.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 9074 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.75, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.75, "completions/mean_terminated_length": 6.75, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 168.05555555555554, "frac_reward_zero_std": 1.0, "grad_norm": 0.031638987362384796, "kl": 0.0020457676146179438, "learning_rate": 5.144444444444445e-07, "loss": 0.0001, "num_tokens": 2708977.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 9075 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.25, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 4.25, "completions/mean_terminated_length": 4.25, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 168.07407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.01682228222489357, "kl": 0.000282663109828718, "learning_rate": 5.138888888888889e-07, "loss": 0.0, "num_tokens": 2709190.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 9076 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 43.5, "completions/clipped_ratio": 0.0, "completions/max_length": 45.0, "completions/max_terminated_length": 45.0, "completions/mean_length": 43.5, "completions/mean_terminated_length": 43.5, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 168.09259259259258, "frac_reward_zero_std": 1.0, "grad_norm": 0.031974561512470245, "kl": 0.0494921188801527, "learning_rate": 5.133333333333334e-07, "loss": 0.0025, "num_tokens": 2709644.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 9077 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 168.11111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.062121033668518066, "kl": 0.17191477864980698, "learning_rate": 5.127777777777778e-07, "loss": 0.0086, "num_tokens": 2709928.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9078 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 168.12962962962962, "frac_reward_zero_std": 1.0, "grad_norm": 0.008347327820956707, "kl": 0.02664577215909958, "learning_rate": 5.122222222222222e-07, "loss": 0.0013, "num_tokens": 2710144.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 9079 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 168.14814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.012913959100842476, "kl": 0.007798187434673309, "learning_rate": 5.116666666666667e-07, "loss": 0.0004, "num_tokens": 2710380.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 9080 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 168.16666666666666, "frac_reward_zero_std": 1.0, "grad_norm": 0.006173583213239908, "kl": 0.0036875264486297965, "learning_rate": 5.111111111111112e-07, "loss": 0.0002, "num_tokens": 2710644.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9081 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 35.0, "completions/clipped_ratio": 0.0, "completions/max_length": 37.0, "completions/max_terminated_length": 37.0, "completions/mean_length": 35.0, "completions/mean_terminated_length": 35.0, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 168.1851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.052064646035432816, "kl": 0.07700113579630852, "learning_rate": 5.105555555555556e-07, "loss": 0.0039, "num_tokens": 2711012.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9082 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.5, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 16.5, "completions/mean_terminated_length": 16.5, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 168.2037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.027374809607863426, "kl": 0.011204625479876995, "learning_rate": 5.1e-07, "loss": 0.0006, "num_tokens": 2711326.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 9083 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 168.22222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.02992325834929943, "kl": 0.004819957131985575, "learning_rate": 5.094444444444444e-07, "loss": 0.0002, "num_tokens": 2711614.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9084 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.25, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 16.25, "completions/mean_terminated_length": 16.25, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 168.24074074074073, "frac_reward_zero_std": 1.0, "grad_norm": 0.07022213190793991, "kl": 0.2813033163547516, "learning_rate": 5.088888888888889e-07, "loss": 0.0141, "num_tokens": 2711903.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9085 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 168.25925925925927, "frac_reward_zero_std": 1.0, "grad_norm": 0.0007202859851531684, "kl": 5.2072107791900635e-05, "learning_rate": 5.083333333333334e-07, "loss": 0.0, "num_tokens": 2712123.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 9086 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.0, "completions/clipped_ratio": 0.0, "completions/max_length": 27.0, "completions/max_terminated_length": 27.0, "completions/mean_length": 21.0, "completions/mean_terminated_length": 21.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 168.27777777777777, "frac_reward_zero_std": 0.0, "grad_norm": 3.115480899810791, "kl": 0.12995484471321106, "learning_rate": 5.077777777777778e-07, "loss": 0.0943, "num_tokens": 2712435.0, "reward": 5.5, "reward_std": 5.0, "rewards/reward_combined/mean": 5.5, "rewards/reward_combined/std": 5.0, "step": 9087 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.75, "completions/clipped_ratio": 0.0, "completions/max_length": 29.0, "completions/max_terminated_length": 29.0, "completions/mean_length": 21.75, "completions/mean_terminated_length": 21.75, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 168.2962962962963, "frac_reward_zero_std": 0.0, "grad_norm": 2.6778056621551514, "kl": 0.0524611072614789, "learning_rate": 5.072222222222223e-07, "loss": 0.0956, "num_tokens": 2712774.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 9088 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 168.3148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.07320735603570938, "kl": 0.4251664876937866, "learning_rate": 5.066666666666667e-07, "loss": 0.0213, "num_tokens": 2713058.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9089 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 168.33333333333334, "frac_reward_zero_std": 1.0, "grad_norm": 0.11968803405761719, "kl": 0.011722220573574305, "learning_rate": 5.061111111111112e-07, "loss": 0.0006, "num_tokens": 2713326.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9090 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.25, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 15.25, "completions/mean_terminated_length": 15.25, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 168.35185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.042431097477674484, "kl": 0.022317298396956176, "learning_rate": 5.055555555555556e-07, "loss": 0.0011, "num_tokens": 2713619.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9091 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.0, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 23.0, "completions/mean_terminated_length": 23.0, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 168.37037037037038, "frac_reward_zero_std": 1.0, "grad_norm": 0.037795308977365494, "kl": 0.008265220560133457, "learning_rate": 5.05e-07, "loss": 0.0004, "num_tokens": 2713939.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9092 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.25, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 6.25, "completions/mean_terminated_length": 6.25, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 168.38888888888889, "frac_reward_zero_std": 0.0, "grad_norm": 5.308904647827148, "kl": 0.05203948021517135, "learning_rate": 5.044444444444445e-07, "loss": 0.0277, "num_tokens": 2714164.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 9093 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 31.25, "completions/clipped_ratio": 0.0, "completions/max_length": 37.0, "completions/max_terminated_length": 37.0, "completions/mean_length": 31.25, "completions/mean_terminated_length": 31.25, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 168.40740740740742, "frac_reward_zero_std": 1.0, "grad_norm": 0.1073240339756012, "kl": 0.12836672365665436, "learning_rate": 5.038888888888889e-07, "loss": 0.0065, "num_tokens": 2714505.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9094 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 18.0, "completions/mean_terminated_length": 18.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 168.42592592592592, "frac_reward_zero_std": 0.0, "grad_norm": 3.451779842376709, "kl": 0.19776763021945953, "learning_rate": 5.033333333333334e-07, "loss": 0.0499, "num_tokens": 2714817.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 9095 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 168.44444444444446, "frac_reward_zero_std": 1.0, "grad_norm": 0.011935057118535042, "kl": 0.03180143050849438, "learning_rate": 5.027777777777778e-07, "loss": 0.0016, "num_tokens": 2715109.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9096 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 39.75, "completions/clipped_ratio": 0.0, "completions/max_length": 46.0, "completions/max_terminated_length": 46.0, "completions/mean_length": 39.75, "completions/mean_terminated_length": 39.75, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 168.46296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.4198305308818817, "kl": 0.17676562815904617, "learning_rate": 5.022222222222222e-07, "loss": 0.0094, "num_tokens": 2715536.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9097 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 40.25, "completions/clipped_ratio": 0.0, "completions/max_length": 50.0, "completions/max_terminated_length": 50.0, "completions/mean_length": 40.25, "completions/mean_terminated_length": 40.25, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 168.4814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.056661248207092285, "kl": 0.07448418438434601, "learning_rate": 5.016666666666667e-07, "loss": 0.0034, "num_tokens": 2715921.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9098 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 168.5, "frac_reward_zero_std": 1.0, "grad_norm": 0.02313900738954544, "kl": 0.003455116180703044, "learning_rate": 5.011111111111112e-07, "loss": 0.0002, "num_tokens": 2716205.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9099 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 37.25, "completions/clipped_ratio": 0.0, "completions/max_length": 40.0, "completions/max_terminated_length": 40.0, "completions/mean_length": 37.25, "completions/mean_terminated_length": 37.25, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 168.5185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.04919252172112465, "kl": 0.09829603880643845, "learning_rate": 5.005555555555556e-07, "loss": 0.0049, "num_tokens": 2716606.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9100 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.25, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 14.25, "completions/mean_terminated_length": 14.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 168.53703703703704, "frac_reward_zero_std": 0.0, "grad_norm": 3.5669848918914795, "kl": 0.15583595354110003, "learning_rate": 5.000000000000001e-07, "loss": 0.0833, "num_tokens": 2716887.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 9101 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 10.5, "completions/mean_terminated_length": 10.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 168.55555555555554, "frac_reward_zero_std": 1.0, "grad_norm": 0.03468798100948334, "kl": 0.00683140260662185, "learning_rate": 4.994444444444444e-07, "loss": 0.0004, "num_tokens": 2717149.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 9102 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.5, "completions/clipped_ratio": 0.0, "completions/max_length": 27.0, "completions/max_terminated_length": 27.0, "completions/mean_length": 24.5, "completions/mean_terminated_length": 24.5, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 168.57407407407408, "frac_reward_zero_std": 0.0, "grad_norm": 3.277040719985962, "kl": 0.11582371685653925, "learning_rate": 4.98888888888889e-07, "loss": -0.0085, "num_tokens": 2717495.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 9103 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 168.59259259259258, "frac_reward_zero_std": 1.0, "grad_norm": 0.022680222988128662, "kl": 0.013720571994781494, "learning_rate": 4.983333333333334e-07, "loss": 0.0007, "num_tokens": 2717755.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9104 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.25, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 12.25, "completions/mean_terminated_length": 12.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 168.61111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.048508550971746445, "kl": 0.002723507466726005, "learning_rate": 4.977777777777778e-07, "loss": 0.0001, "num_tokens": 2718016.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9105 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.25, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 20.25, "completions/mean_terminated_length": 20.25, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 168.62962962962962, "frac_reward_zero_std": 1.0, "grad_norm": 0.05467425659298897, "kl": 0.13094220496714115, "learning_rate": 4.972222222222223e-07, "loss": 0.0071, "num_tokens": 2718345.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9106 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 34.25, "completions/clipped_ratio": 0.0, "completions/max_length": 48.0, "completions/max_terminated_length": 48.0, "completions/mean_length": 34.25, "completions/mean_terminated_length": 34.25, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 168.64814814814815, "frac_reward_zero_std": 0.0, "grad_norm": 3.463857412338257, "kl": 0.3280081143602729, "learning_rate": 4.966666666666666e-07, "loss": -0.0272, "num_tokens": 2718702.0, "reward": 4.625, "reward_std": 3.4731109142303467, "rewards/reward_combined/mean": 4.625, "rewards/reward_combined/std": 3.4731109142303467, "step": 9107 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 30.25, "completions/clipped_ratio": 0.0, "completions/max_length": 34.0, "completions/max_terminated_length": 34.0, "completions/mean_length": 30.25, "completions/mean_terminated_length": 30.25, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 168.66666666666666, "frac_reward_zero_std": 1.0, "grad_norm": 0.06214684993028641, "kl": 0.13617508113384247, "learning_rate": 4.961111111111112e-07, "loss": 0.0068, "num_tokens": 2719047.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9108 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.25, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 17.25, "completions/mean_terminated_length": 17.25, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 168.6851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.024913154542446136, "kl": 0.002078409946989268, "learning_rate": 4.955555555555556e-07, "loss": 0.0001, "num_tokens": 2719356.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9109 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.75, "completions/clipped_ratio": 0.0, "completions/max_length": 35.0, "completions/max_terminated_length": 35.0, "completions/mean_length": 21.75, "completions/mean_terminated_length": 21.75, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 168.7037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.132681205868721, "kl": 0.0360269658267498, "learning_rate": 4.95e-07, "loss": 0.0026, "num_tokens": 2719663.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9110 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 168.72222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.011689120903611183, "kl": 0.0031319059198722243, "learning_rate": 4.944444444444445e-07, "loss": 0.0001, "num_tokens": 2719965.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9111 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.5, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 8.5, "completions/mean_terminated_length": 8.5, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 168.74074074074073, "frac_reward_zero_std": 1.0, "grad_norm": 0.012298086658120155, "kl": 0.002340660197660327, "learning_rate": 4.93888888888889e-07, "loss": 0.0001, "num_tokens": 2720219.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9112 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 168.75925925925927, "frac_reward_zero_std": 1.0, "grad_norm": 0.09975802153348923, "kl": 0.00987153872847557, "learning_rate": 4.933333333333334e-07, "loss": 0.0005, "num_tokens": 2720479.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 9113 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 168.77777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.30810511112213135, "kl": 0.05035156384110451, "learning_rate": 4.927777777777778e-07, "loss": 0.0029, "num_tokens": 2720757.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9114 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.5, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 18.5, "completions/mean_terminated_length": 18.5, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 168.7962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.023263370618224144, "kl": 0.008791058557108045, "learning_rate": 4.922222222222222e-07, "loss": 0.0004, "num_tokens": 2721059.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9115 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 168.8148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.22894148528575897, "kl": 0.025250872247852385, "learning_rate": 4.916666666666667e-07, "loss": 0.0014, "num_tokens": 2721357.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9116 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.0, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 22.0, "completions/mean_terminated_length": 22.0, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 168.83333333333334, "frac_reward_zero_std": 1.0, "grad_norm": 0.03200143203139305, "kl": 0.005850526737049222, "learning_rate": 4.911111111111112e-07, "loss": 0.0003, "num_tokens": 2721689.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9117 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.5, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 2.5, "completions/mean_terminated_length": 2.5, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 168.85185185185185, "frac_reward_zero_std": 0.0, "grad_norm": 7.223660945892334, "kl": 0.6061653941869736, "learning_rate": 4.905555555555556e-07, "loss": 0.2052, "num_tokens": 2721903.0, "reward": 2.0, "reward_std": 3.0, "rewards/reward_combined/mean": 2.0, "rewards/reward_combined/std": 3.0, "step": 9118 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.75, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.75, "completions/mean_terminated_length": 14.75, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 168.87037037037038, "frac_reward_zero_std": 1.0, "grad_norm": 0.021340006962418556, "kl": 0.002049380214884877, "learning_rate": 4.900000000000001e-07, "loss": 0.0001, "num_tokens": 2722222.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9119 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 168.88888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.008171357214450836, "kl": 0.2264852598309517, "learning_rate": 4.894444444444444e-07, "loss": 0.0113, "num_tokens": 2722524.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9120 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 168.90740740740742, "frac_reward_zero_std": 1.0, "grad_norm": 0.02443399839103222, "kl": 0.0025632530450820923, "learning_rate": 4.88888888888889e-07, "loss": 0.0001, "num_tokens": 2722732.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 9121 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 168.92592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.007980992086231709, "kl": 0.009614971932023764, "learning_rate": 4.883333333333334e-07, "loss": 0.0005, "num_tokens": 2723004.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9122 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.25, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.25, "completions/mean_terminated_length": 13.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 168.94444444444446, "frac_reward_zero_std": 1.0, "grad_norm": 0.005722197238355875, "kl": 0.0019023361383005977, "learning_rate": 4.877777777777778e-07, "loss": 0.0001, "num_tokens": 2723281.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9123 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.75, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 18.75, "completions/mean_terminated_length": 18.75, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 168.96296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.08791517466306686, "kl": 0.019341687206178904, "learning_rate": 4.872222222222223e-07, "loss": 0.001, "num_tokens": 2723576.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9124 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 31.5, "completions/clipped_ratio": 0.0, "completions/max_length": 34.0, "completions/max_terminated_length": 34.0, "completions/mean_length": 31.5, "completions/mean_terminated_length": 31.5, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 168.9814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.36319607496261597, "kl": 0.12474739924073219, "learning_rate": 4.866666666666666e-07, "loss": 0.0062, "num_tokens": 2723938.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9125 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 169.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.0037274027708917856, "kl": 0.00018191039271187037, "learning_rate": 4.861111111111112e-07, "loss": 0.0, "num_tokens": 2724194.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 9126 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.75, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.75, "completions/mean_terminated_length": 14.75, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 169.0185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.020597416907548904, "kl": 0.006176999391755089, "learning_rate": 4.855555555555556e-07, "loss": 0.0003, "num_tokens": 2724485.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9127 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 45.5, "completions/clipped_ratio": 0.0, "completions/max_length": 51.0, "completions/max_terminated_length": 51.0, "completions/mean_length": 45.5, "completions/mean_terminated_length": 45.5, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 169.03703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.06238389387726784, "kl": 0.051564354449510574, "learning_rate": 4.85e-07, "loss": 0.0026, "num_tokens": 2724947.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 9128 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.0, "completions/clipped_ratio": 0.0, "completions/max_length": 30.0, "completions/max_terminated_length": 30.0, "completions/mean_length": 23.0, "completions/mean_terminated_length": 23.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 169.05555555555554, "frac_reward_zero_std": 1.0, "grad_norm": 0.22446751594543457, "kl": 0.07746152579784393, "learning_rate": 4.844444444444445e-07, "loss": 0.0037, "num_tokens": 2725267.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9129 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 169.07407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 3.3920849091373384e-05, "kl": 4.775822162628174e-06, "learning_rate": 4.838888888888889e-07, "loss": 0.0, "num_tokens": 2725487.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 9130 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 31.5, "completions/clipped_ratio": 0.0, "completions/max_length": 49.0, "completions/max_terminated_length": 49.0, "completions/mean_length": 31.5, "completions/mean_terminated_length": 31.5, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 169.09259259259258, "frac_reward_zero_std": 1.0, "grad_norm": 0.029136819764971733, "kl": 0.02434265799820423, "learning_rate": 4.833333333333334e-07, "loss": 0.0012, "num_tokens": 2725833.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9131 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 5.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 5.0, "completions/mean_terminated_length": 5.0, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 169.11111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.00469029787927866, "kl": 0.0007073163869790733, "learning_rate": 4.827777777777779e-07, "loss": 0.0, "num_tokens": 2726053.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 9132 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 35.0, "completions/clipped_ratio": 0.0, "completions/max_length": 37.0, "completions/max_terminated_length": 37.0, "completions/mean_length": 35.0, "completions/mean_terminated_length": 35.0, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 169.12962962962962, "frac_reward_zero_std": 1.0, "grad_norm": 0.025084884837269783, "kl": 0.06178387999534607, "learning_rate": 4.822222222222222e-07, "loss": 0.0031, "num_tokens": 2726421.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9133 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 7.25, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 7.25, "completions/mean_terminated_length": 7.25, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 169.14814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.06706982105970383, "kl": 0.026134670712053776, "learning_rate": 4.816666666666668e-07, "loss": 0.0013, "num_tokens": 2726650.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 9134 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 31.75, "completions/clipped_ratio": 0.0, "completions/max_length": 42.0, "completions/max_terminated_length": 42.0, "completions/mean_length": 31.75, "completions/mean_terminated_length": 31.75, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 169.16666666666666, "frac_reward_zero_std": 1.0, "grad_norm": 0.48723194003105164, "kl": 0.24309422075748444, "learning_rate": 4.811111111111111e-07, "loss": 0.012, "num_tokens": 2727045.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9135 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.25, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 22.25, "completions/mean_terminated_length": 22.25, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 169.1851851851852, "frac_reward_zero_std": 0.0, "grad_norm": 2.9782469272613525, "kl": 0.013312173774465919, "learning_rate": 4.805555555555556e-07, "loss": 0.0092, "num_tokens": 2727378.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 9136 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.25, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 13.25, "completions/mean_terminated_length": 13.25, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 169.2037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.1168050467967987, "kl": 0.03386817593127489, "learning_rate": 4.800000000000001e-07, "loss": 0.002, "num_tokens": 2727651.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9137 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 18.0, "completions/mean_terminated_length": 18.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 169.22222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.008335616439580917, "kl": 0.0013409869279712439, "learning_rate": 4.794444444444444e-07, "loss": 0.0001, "num_tokens": 2727963.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9138 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 16.5, "completions/mean_terminated_length": 16.5, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 169.24074074074073, "frac_reward_zero_std": 1.0, "grad_norm": 0.04103755205869675, "kl": 0.15699506551027298, "learning_rate": 4.78888888888889e-07, "loss": 0.0079, "num_tokens": 2728269.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9139 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.5, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 22.5, "completions/mean_terminated_length": 22.5, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 169.25925925925927, "frac_reward_zero_std": 1.0, "grad_norm": 0.043734174221754074, "kl": 0.010056297294795513, "learning_rate": 4.783333333333333e-07, "loss": 0.0005, "num_tokens": 2728587.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9140 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 169.27777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.0036731280852109194, "kl": 0.0030785053968429565, "learning_rate": 4.777777777777778e-07, "loss": 0.0002, "num_tokens": 2728847.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 9141 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.0, "completions/clipped_ratio": 0.0, "completions/max_length": 32.0, "completions/max_terminated_length": 32.0, "completions/mean_length": 20.0, "completions/mean_terminated_length": 20.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 169.2962962962963, "frac_reward_zero_std": 0.0, "grad_norm": 4.437109470367432, "kl": 0.23824016749858856, "learning_rate": 4.772222222222223e-07, "loss": 0.1822, "num_tokens": 2729147.0, "reward": 5.75, "reward_std": 4.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 4.5, "step": 9142 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.5, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 8.5, "completions/mean_terminated_length": 8.5, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 169.3148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.011018054559826851, "kl": 0.0009313106420449913, "learning_rate": 4.766666666666667e-07, "loss": 0.0, "num_tokens": 2729401.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 9143 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 169.33333333333334, "frac_reward_zero_std": 1.0, "grad_norm": 0.06218692287802696, "kl": 0.01477856282144785, "learning_rate": 4.7611111111111113e-07, "loss": 0.0007, "num_tokens": 2729681.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9144 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.25, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.25, "completions/mean_terminated_length": 13.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 169.35185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.005697634536772966, "kl": 0.001898379297927022, "learning_rate": 4.7555555555555554e-07, "loss": 0.0001, "num_tokens": 2729958.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9145 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.5, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 21.5, "completions/mean_terminated_length": 21.5, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 169.37037037037038, "frac_reward_zero_std": 1.0, "grad_norm": 0.026063507422804832, "kl": 0.011538487859070301, "learning_rate": 4.7500000000000006e-07, "loss": 0.0006, "num_tokens": 2730296.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9146 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 7.75, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 7.75, "completions/mean_terminated_length": 7.75, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 169.38888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.09171555191278458, "kl": 0.006186310667544603, "learning_rate": 4.7444444444444447e-07, "loss": 0.0003, "num_tokens": 2730539.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 9147 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.5, "completions/clipped_ratio": 0.0, "completions/max_length": 78.0, "completions/max_terminated_length": 78.0, "completions/mean_length": 22.5, "completions/mean_terminated_length": 22.5, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 169.40740740740742, "frac_reward_zero_std": 0.0, "grad_norm": 3.193080186843872, "kl": 0.018744326196610928, "learning_rate": 4.7388888888888893e-07, "loss": 0.4588, "num_tokens": 2730849.0, "reward": 3.625, "reward_std": 0.75, "rewards/reward_combined/mean": 3.625, "rewards/reward_combined/std": 0.75, "step": 9148 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 169.42592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.040331147611141205, "kl": 0.0060832020826637745, "learning_rate": 4.7333333333333334e-07, "loss": 0.0003, "num_tokens": 2731141.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9149 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 33.75, "completions/clipped_ratio": 0.0, "completions/max_length": 36.0, "completions/max_terminated_length": 36.0, "completions/mean_length": 33.75, "completions/mean_terminated_length": 33.75, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 169.44444444444446, "frac_reward_zero_std": 0.0, "grad_norm": 2.2106637954711914, "kl": 0.1391107141971588, "learning_rate": 4.7277777777777785e-07, "loss": 0.0196, "num_tokens": 2731500.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 9150 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 169.46296296296296, "frac_reward_zero_std": 0.0, "grad_norm": 4.175132751464844, "kl": 0.056023333221673965, "learning_rate": 4.7222222222222226e-07, "loss": 0.0045, "num_tokens": 2731796.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 9151 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 169.4814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.02246120572090149, "kl": 0.01364948321133852, "learning_rate": 4.716666666666667e-07, "loss": 0.0007, "num_tokens": 2732056.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9152 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.75, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 13.75, "completions/mean_terminated_length": 13.75, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 169.5, "frac_reward_zero_std": 1.0, "grad_norm": 0.037438347935676575, "kl": 0.0022123324451968074, "learning_rate": 4.7111111111111113e-07, "loss": 0.0001, "num_tokens": 2732323.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9153 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 27.25, "completions/clipped_ratio": 0.0, "completions/max_length": 30.0, "completions/max_terminated_length": 30.0, "completions/mean_length": 27.25, "completions/mean_terminated_length": 27.25, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 169.5185185185185, "frac_reward_zero_std": 0.0, "grad_norm": 3.2939422130584717, "kl": 0.19739488512277603, "learning_rate": 4.7055555555555564e-07, "loss": -0.0067, "num_tokens": 2732648.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 9154 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 169.53703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.029262783005833626, "kl": 0.003928191494196653, "learning_rate": 4.7000000000000005e-07, "loss": 0.0002, "num_tokens": 2732948.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9155 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.25, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 4.25, "completions/mean_terminated_length": 4.25, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 169.55555555555554, "frac_reward_zero_std": 1.0, "grad_norm": 0.14412233233451843, "kl": 0.012253889814019203, "learning_rate": 4.6944444444444446e-07, "loss": 0.0007, "num_tokens": 2733161.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 9156 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 28.5, "completions/clipped_ratio": 0.0, "completions/max_length": 43.0, "completions/max_terminated_length": 43.0, "completions/mean_length": 28.5, "completions/mean_terminated_length": 28.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 169.57407407407408, "frac_reward_zero_std": 0.0, "grad_norm": 1.882574200630188, "kl": 0.12361159175634384, "learning_rate": 4.688888888888889e-07, "loss": -0.2249, "num_tokens": 2733527.0, "reward": 6.0, "reward_std": 3.674234628677368, "rewards/reward_combined/mean": 6.0, "rewards/reward_combined/std": 3.674234628677368, "step": 9157 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.5, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 17.5, "completions/mean_terminated_length": 17.5, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 169.59259259259258, "frac_reward_zero_std": 1.0, "grad_norm": 0.08300977200269699, "kl": 0.01504704961553216, "learning_rate": 4.6833333333333333e-07, "loss": 0.0008, "num_tokens": 2733813.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9158 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 169.61111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.00619354797527194, "kl": 0.22629867494106293, "learning_rate": 4.6777777777777785e-07, "loss": 0.0113, "num_tokens": 2734115.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9159 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 10.5, "completions/mean_terminated_length": 10.5, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 169.62962962962962, "frac_reward_zero_std": 1.0, "grad_norm": 0.00930134579539299, "kl": 0.0035781769547611475, "learning_rate": 4.6722222222222225e-07, "loss": 0.0002, "num_tokens": 2734381.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9160 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.5, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 19.5, "completions/mean_terminated_length": 19.5, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 169.64814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.044464949518442154, "kl": 0.009481491521000862, "learning_rate": 4.666666666666667e-07, "loss": 0.0005, "num_tokens": 2734707.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 9161 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 169.66666666666666, "frac_reward_zero_std": 1.0, "grad_norm": 0.0076379189267754555, "kl": 0.0016494393348693848, "learning_rate": 4.661111111111111e-07, "loss": 0.0001, "num_tokens": 2734919.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 9162 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 18.0, "completions/mean_terminated_length": 18.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 169.6851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.043818388134241104, "kl": 0.01974593475461006, "learning_rate": 4.6555555555555564e-07, "loss": 0.001, "num_tokens": 2735211.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9163 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 169.7037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.26463380455970764, "kl": 0.05022773321252316, "learning_rate": 4.6500000000000005e-07, "loss": 0.0028, "num_tokens": 2735499.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9164 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 169.72222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.06165578216314316, "kl": 0.0459688026458025, "learning_rate": 4.6444444444444446e-07, "loss": 0.0025, "num_tokens": 2735775.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9165 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 29.0, "completions/clipped_ratio": 0.0, "completions/max_length": 33.0, "completions/max_terminated_length": 33.0, "completions/mean_length": 29.0, "completions/mean_terminated_length": 29.0, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 169.74074074074073, "frac_reward_zero_std": 1.0, "grad_norm": 0.21374008059501648, "kl": 0.03922738879919052, "learning_rate": 4.638888888888889e-07, "loss": 0.0019, "num_tokens": 2736115.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9166 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 27.5, "completions/clipped_ratio": 0.0, "completions/max_length": 36.0, "completions/max_terminated_length": 36.0, "completions/mean_length": 27.5, "completions/mean_terminated_length": 27.5, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 169.75925925925927, "frac_reward_zero_std": 0.0, "grad_norm": 1.7247251272201538, "kl": 0.17337842658162117, "learning_rate": 4.6333333333333333e-07, "loss": 0.0899, "num_tokens": 2736461.0, "reward": 5.875, "reward_std": 3.25, "rewards/reward_combined/mean": 5.875, "rewards/reward_combined/std": 3.25, "step": 9167 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 169.77777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.03767053037881851, "kl": 0.0072277323342859745, "learning_rate": 4.6277777777777784e-07, "loss": 0.0004, "num_tokens": 2736733.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9168 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.5, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.5, "completions/mean_terminated_length": 6.5, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 169.7962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.21893464028835297, "kl": 0.026928784558549523, "learning_rate": 4.6222222222222225e-07, "loss": 0.0013, "num_tokens": 2736967.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 9169 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.75, "completions/clipped_ratio": 0.0, "completions/max_length": 31.0, "completions/max_terminated_length": 31.0, "completions/mean_length": 25.75, "completions/mean_terminated_length": 25.75, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 169.8148148148148, "frac_reward_zero_std": 0.0, "grad_norm": 4.477327346801758, "kl": 0.1714181751012802, "learning_rate": 4.616666666666667e-07, "loss": 0.0271, "num_tokens": 2737290.0, "reward": 2.5, "reward_std": 1.7320507764816284, "rewards/reward_combined/mean": 2.5, "rewards/reward_combined/std": 1.7320507764816284, "step": 9170 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 169.83333333333334, "frac_reward_zero_std": 1.0, "grad_norm": 0.0113602289929986, "kl": 0.0014911949983797967, "learning_rate": 4.611111111111111e-07, "loss": 0.0001, "num_tokens": 2737612.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9171 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 169.85185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.01050410233438015, "kl": 0.00018708706193137914, "learning_rate": 4.6055555555555563e-07, "loss": 0.0, "num_tokens": 2737868.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 9172 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 169.87037037037038, "frac_reward_zero_std": 1.0, "grad_norm": 0.0023988820612430573, "kl": 0.28227464854717255, "learning_rate": 4.6000000000000004e-07, "loss": 0.0141, "num_tokens": 2738156.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9173 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 169.88888888888889, "frac_reward_zero_std": 0.0, "grad_norm": 4.997798919677734, "kl": 0.028479069471359253, "learning_rate": 4.594444444444445e-07, "loss": 0.1118, "num_tokens": 2738462.0, "reward": 5.125, "reward_std": 3.4731109142303467, "rewards/reward_combined/mean": 5.125, "rewards/reward_combined/std": 3.4731109142303467, "step": 9174 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 169.90740740740742, "frac_reward_zero_std": 1.0, "grad_norm": 0.029909009113907814, "kl": 0.0008407905697822571, "learning_rate": 4.588888888888889e-07, "loss": 0.0, "num_tokens": 2738674.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 9175 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 169.92592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.005925916135311127, "kl": 0.005744681693613529, "learning_rate": 4.583333333333333e-07, "loss": 0.0003, "num_tokens": 2738986.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 9176 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.0, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 19.0, "completions/mean_terminated_length": 19.0, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 169.94444444444446, "frac_reward_zero_std": 1.0, "grad_norm": 0.02915809489786625, "kl": 0.01210392452776432, "learning_rate": 4.5777777777777784e-07, "loss": 0.0006, "num_tokens": 2739310.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9177 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 9.5, "completions/mean_terminated_length": 9.5, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 169.96296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.03398146480321884, "kl": 0.002892606658861041, "learning_rate": 4.5722222222222224e-07, "loss": 0.0001, "num_tokens": 2739568.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9178 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 169.9814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.01334507018327713, "kl": 0.43810221552848816, "learning_rate": 4.566666666666667e-07, "loss": 0.0219, "num_tokens": 2739852.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9179 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.5, "completions/clipped_ratio": 0.0, "completions/max_length": 25.0, "completions/max_terminated_length": 25.0, "completions/mean_length": 17.5, "completions/mean_terminated_length": 17.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 170.0, "frac_reward_zero_std": 0.0, "grad_norm": 3.582838773727417, "kl": 0.020299060735851526, "learning_rate": 4.561111111111111e-07, "loss": 0.1678, "num_tokens": 2740150.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 9180 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 37.75, "completions/clipped_ratio": 0.0, "completions/max_length": 57.0, "completions/max_terminated_length": 57.0, "completions/mean_length": 37.75, "completions/mean_terminated_length": 37.75, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 170.0185185185185, "frac_reward_zero_std": 0.0, "grad_norm": 2.690735101699829, "kl": 0.18161392211914062, "learning_rate": 4.5555555555555563e-07, "loss": 0.1164, "num_tokens": 2740553.0, "reward": 6.125, "reward_std": 3.4247870445251465, "rewards/reward_combined/mean": 6.125, "rewards/reward_combined/std": 3.4247870445251465, "step": 9181 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.75, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 17.75, "completions/mean_terminated_length": 17.75, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 170.03703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.3413526117801666, "kl": 0.19485041499137878, "learning_rate": 4.5500000000000004e-07, "loss": 0.0099, "num_tokens": 2740844.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9182 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 170.05555555555554, "frac_reward_zero_std": 1.0, "grad_norm": 0.0075660268776118755, "kl": 0.001751735806465149, "learning_rate": 4.544444444444445e-07, "loss": 0.0001, "num_tokens": 2741056.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 9183 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 170.07407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.00134996825363487, "kl": 0.0017200048896484077, "learning_rate": 4.538888888888889e-07, "loss": 0.0001, "num_tokens": 2741336.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9184 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 170.09259259259258, "frac_reward_zero_std": 1.0, "grad_norm": 0.03941572457551956, "kl": 0.018383242655545473, "learning_rate": 4.533333333333334e-07, "loss": 0.0009, "num_tokens": 2741626.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9185 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 170.11111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.005028458312153816, "kl": 0.003152165561914444, "learning_rate": 4.5277777777777783e-07, "loss": 0.0002, "num_tokens": 2741886.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 9186 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 170.12962962962962, "frac_reward_zero_std": 1.0, "grad_norm": 3.642714727902785e-05, "kl": 4.366040229797363e-06, "learning_rate": 4.5222222222222224e-07, "loss": 0.0, "num_tokens": 2742106.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 9187 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 170.14814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.19491547346115112, "kl": 0.01453420054167509, "learning_rate": 4.516666666666667e-07, "loss": 0.001, "num_tokens": 2742318.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 9188 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 170.16666666666666, "frac_reward_zero_std": 1.0, "grad_norm": 0.04470626264810562, "kl": 0.0023201137082651258, "learning_rate": 4.511111111111111e-07, "loss": 0.0001, "num_tokens": 2742578.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9189 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.5, "completions/clipped_ratio": 0.0, "completions/max_length": 28.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 18.5, "completions/mean_terminated_length": 18.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 170.1851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.10782022029161453, "kl": 0.26727497577667236, "learning_rate": 4.505555555555556e-07, "loss": 0.0127, "num_tokens": 2742876.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9190 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 36.25, "completions/clipped_ratio": 0.0, "completions/max_length": 38.0, "completions/max_terminated_length": 38.0, "completions/mean_length": 36.25, "completions/mean_terminated_length": 36.25, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 170.2037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.15159238874912262, "kl": 0.09731388837099075, "learning_rate": 4.5000000000000003e-07, "loss": 0.0049, "num_tokens": 2743249.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9191 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 170.22222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.011005835607647896, "kl": 0.0031815325492061675, "learning_rate": 4.494444444444445e-07, "loss": 0.0002, "num_tokens": 2743553.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9192 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 170.24074074074073, "frac_reward_zero_std": 1.0, "grad_norm": 0.03218679875135422, "kl": 0.002324662869796157, "learning_rate": 4.488888888888889e-07, "loss": 0.0001, "num_tokens": 2743875.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9193 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 27.5, "completions/clipped_ratio": 0.0, "completions/max_length": 35.0, "completions/max_terminated_length": 35.0, "completions/mean_length": 27.5, "completions/mean_terminated_length": 27.5, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 170.25925925925927, "frac_reward_zero_std": 1.0, "grad_norm": 0.405368447303772, "kl": 0.14651182293891907, "learning_rate": 4.483333333333334e-07, "loss": 0.008, "num_tokens": 2744205.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 9194 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.75, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 10.75, "completions/mean_terminated_length": 10.75, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 170.27777777777777, "frac_reward_zero_std": 0.0, "grad_norm": 3.526332139968872, "kl": 0.5533760488033295, "learning_rate": 4.477777777777778e-07, "loss": 0.1765, "num_tokens": 2744472.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 9195 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 5.5, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 5.5, "completions/mean_terminated_length": 5.5, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 170.2962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.06847608834505081, "kl": 0.0033054965315386653, "learning_rate": 4.4722222222222223e-07, "loss": 0.0002, "num_tokens": 2744694.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 9196 }, { "clip_ratio/high_max": 0.02380952425301075, "clip_ratio/high_mean": 0.02380952425301075, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.02380952425301075, "completion_length": 17.5, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 17.5, "completions/mean_terminated_length": 17.5, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 170.3148148148148, "frac_reward_zero_std": 0.0, "grad_norm": 5.344804286956787, "kl": 0.021148365456610918, "learning_rate": 4.466666666666667e-07, "loss": 0.0238, "num_tokens": 2744984.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 9197 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 170.33333333333334, "frac_reward_zero_std": 1.0, "grad_norm": 0.02386375330388546, "kl": 0.01020551286637783, "learning_rate": 4.461111111111111e-07, "loss": 0.0005, "num_tokens": 2745296.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 9198 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.5, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 8.5, "completions/mean_terminated_length": 8.5, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 170.35185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.011796833015978336, "kl": 0.002591715776361525, "learning_rate": 4.455555555555556e-07, "loss": 0.0001, "num_tokens": 2745550.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9199 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.5, "completions/clipped_ratio": 0.0, "completions/max_length": 30.0, "completions/max_terminated_length": 30.0, "completions/mean_length": 22.5, "completions/mean_terminated_length": 22.5, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 170.37037037037038, "frac_reward_zero_std": 1.0, "grad_norm": 0.18732912838459015, "kl": 0.07958622835576534, "learning_rate": 4.4500000000000003e-07, "loss": 0.0039, "num_tokens": 2745868.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9200 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.75, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 11.75, "completions/mean_terminated_length": 11.75, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 170.38888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.061997488141059875, "kl": 0.039953846484422684, "learning_rate": 4.444444444444445e-07, "loss": 0.0021, "num_tokens": 2746139.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9201 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 170.40740740740742, "frac_reward_zero_std": 1.0, "grad_norm": 0.07471345365047455, "kl": 0.018615826964378357, "learning_rate": 4.438888888888889e-07, "loss": 0.0009, "num_tokens": 2746431.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9202 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.5, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 22.5, "completions/mean_terminated_length": 22.5, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 170.42592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.04989686608314514, "kl": 0.1400460209697485, "learning_rate": 4.433333333333334e-07, "loss": 0.0068, "num_tokens": 2746769.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9203 }, { "clip_ratio/high_max": 0.01666666753590107, "clip_ratio/high_mean": 0.01666666753590107, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.01666666753590107, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 170.44444444444446, "frac_reward_zero_std": 0.0, "grad_norm": 6.813302516937256, "kl": 0.08383697643876076, "learning_rate": 4.427777777777778e-07, "loss": 0.0375, "num_tokens": 2747051.0, "reward": 6.125, "reward_std": 3.75, "rewards/reward_combined/mean": 6.125, "rewards/reward_combined/std": 3.75, "step": 9204 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.75, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 8.75, "completions/mean_terminated_length": 8.75, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 170.46296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.11603619903326035, "kl": 0.011525760171934962, "learning_rate": 4.422222222222223e-07, "loss": 0.0008, "num_tokens": 2747294.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 9205 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 40.5, "completions/clipped_ratio": 0.0, "completions/max_length": 53.0, "completions/max_terminated_length": 53.0, "completions/mean_length": 40.5, "completions/mean_terminated_length": 40.5, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 170.4814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.04477782920002937, "kl": 0.02519911155104637, "learning_rate": 4.416666666666667e-07, "loss": 0.0011, "num_tokens": 2747676.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9206 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.25, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 15.25, "completions/mean_terminated_length": 15.25, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 170.5, "frac_reward_zero_std": 0.0, "grad_norm": 4.625666618347168, "kl": 0.7025375217199326, "learning_rate": 4.411111111111111e-07, "loss": 0.0497, "num_tokens": 2747977.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 9207 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.5, "completions/clipped_ratio": 0.0, "completions/max_length": 34.0, "completions/max_terminated_length": 34.0, "completions/mean_length": 25.5, "completions/mean_terminated_length": 25.5, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 170.5185185185185, "frac_reward_zero_std": 0.0, "grad_norm": 3.448716402053833, "kl": 0.12398221343755722, "learning_rate": 4.405555555555556e-07, "loss": -0.0125, "num_tokens": 2748307.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 9208 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 170.53703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.022364065051078796, "kl": 0.0032785963267087936, "learning_rate": 4.4e-07, "loss": 0.0002, "num_tokens": 2748593.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9209 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.0, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 19.0, "completions/mean_terminated_length": 19.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 170.55555555555554, "frac_reward_zero_std": 1.0, "grad_norm": 0.02622807025909424, "kl": 0.0015753828920423985, "learning_rate": 4.394444444444445e-07, "loss": 0.0001, "num_tokens": 2748909.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9210 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.75, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 20.75, "completions/mean_terminated_length": 20.75, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 170.57407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.06365285068750381, "kl": 0.018784526735544205, "learning_rate": 4.388888888888889e-07, "loss": 0.0009, "num_tokens": 2749244.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9211 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.0, "completions/clipped_ratio": 0.0, "completions/max_length": 27.0, "completions/max_terminated_length": 27.0, "completions/mean_length": 23.0, "completions/mean_terminated_length": 23.0, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 170.59259259259258, "frac_reward_zero_std": 1.0, "grad_norm": 0.363097608089447, "kl": 0.07438035402446985, "learning_rate": 4.383333333333334e-07, "loss": 0.0034, "num_tokens": 2749584.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 9212 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 31.25, "completions/clipped_ratio": 0.0, "completions/max_length": 35.0, "completions/max_terminated_length": 35.0, "completions/mean_length": 31.25, "completions/mean_terminated_length": 31.25, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 170.61111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.09180688112974167, "kl": 0.1289983093738556, "learning_rate": 4.377777777777778e-07, "loss": 0.0064, "num_tokens": 2749933.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9213 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 170.62962962962962, "frac_reward_zero_std": 0.0, "grad_norm": 4.849061012268066, "kl": 0.12089084833860397, "learning_rate": 4.372222222222223e-07, "loss": 0.0962, "num_tokens": 2750213.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 9214 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.25, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.25, "completions/mean_terminated_length": 11.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 170.64814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.08175742626190186, "kl": 0.06093762721866369, "learning_rate": 4.366666666666667e-07, "loss": 0.0029, "num_tokens": 2750486.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9215 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 170.66666666666666, "frac_reward_zero_std": 1.0, "grad_norm": 0.009001275524497032, "kl": 0.00044495612382888794, "learning_rate": 4.361111111111112e-07, "loss": 0.0, "num_tokens": 2750698.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 9216 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 44.0, "completions/clipped_ratio": 0.0, "completions/max_length": 73.0, "completions/max_terminated_length": 73.0, "completions/mean_length": 44.0, "completions/mean_terminated_length": 44.0, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 170.6851851851852, "frac_reward_zero_std": 0.0, "grad_norm": 2.502195358276367, "kl": 0.10142811015248299, "learning_rate": 4.355555555555556e-07, "loss": -0.0306, "num_tokens": 2751098.0, "reward": 3.5, "reward_std": 4.690415859222412, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 4.690415859222412, "step": 9217 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 170.7037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.011901093646883965, "kl": 0.008176930248737335, "learning_rate": 4.35e-07, "loss": 0.0004, "num_tokens": 2751334.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 9218 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 170.72222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.009995866566896439, "kl": 0.026908092200756073, "learning_rate": 4.344444444444445e-07, "loss": 0.0013, "num_tokens": 2751550.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 9219 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.008064515888690948, "clip_ratio/low_min": 0.008064515888690948, "clip_ratio/region_mean": 0.008064515888690948, "completion_length": 31.5, "completions/clipped_ratio": 0.0, "completions/max_length": 38.0, "completions/max_terminated_length": 38.0, "completions/mean_length": 31.5, "completions/mean_terminated_length": 31.5, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 170.74074074074073, "frac_reward_zero_std": 0.0, "grad_norm": 2.1733789443969727, "kl": 0.09904501587152481, "learning_rate": 4.338888888888889e-07, "loss": 0.1091, "num_tokens": 2751912.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 9220 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 43.5, "completions/clipped_ratio": 0.0, "completions/max_length": 45.0, "completions/max_terminated_length": 45.0, "completions/mean_length": 43.5, "completions/mean_terminated_length": 43.5, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "epoch": 170.75925925925927, "frac_reward_zero_std": 1.0, "grad_norm": 0.036667875945568085, "kl": 0.07451936602592468, "learning_rate": 4.333333333333334e-07, "loss": 0.0037, "num_tokens": 2752354.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9221 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.75, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 21.75, "completions/mean_terminated_length": 21.75, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 170.77777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.02417045086622238, "kl": 0.00665917806327343, "learning_rate": 4.327777777777778e-07, "loss": 0.0003, "num_tokens": 2752685.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9222 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 30.75, "completions/clipped_ratio": 0.0, "completions/max_length": 44.0, "completions/max_terminated_length": 44.0, "completions/mean_length": 30.75, "completions/mean_terminated_length": 30.75, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 170.7962962962963, "frac_reward_zero_std": 0.0, "grad_norm": 3.1430306434631348, "kl": 0.12310035899281502, "learning_rate": 4.3222222222222227e-07, "loss": 0.0948, "num_tokens": 2753024.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 9223 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 18.0, "completions/mean_terminated_length": 18.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 170.8148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.01849512569606304, "kl": 0.004345661727711558, "learning_rate": 4.316666666666667e-07, "loss": 0.0002, "num_tokens": 2753324.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9224 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 170.83333333333334, "frac_reward_zero_std": 1.0, "grad_norm": 0.031873032450675964, "kl": 0.009760612156242132, "learning_rate": 4.311111111111112e-07, "loss": 0.0005, "num_tokens": 2753612.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9225 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 170.85185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.011758649721741676, "kl": 0.4379705637693405, "learning_rate": 4.305555555555556e-07, "loss": 0.0219, "num_tokens": 2753896.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9226 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 170.87037037037038, "frac_reward_zero_std": 1.0, "grad_norm": 0.013530576601624489, "kl": 0.0009603872895240784, "learning_rate": 4.3e-07, "loss": 0.0, "num_tokens": 2754140.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 9227 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.25, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 9.25, "completions/mean_terminated_length": 9.25, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 170.88888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.014769203029572964, "kl": 0.011093411128968, "learning_rate": 4.294444444444445e-07, "loss": 0.0005, "num_tokens": 2754401.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9228 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.5, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 20.5, "completions/mean_terminated_length": 20.5, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 170.90740740740742, "frac_reward_zero_std": 1.0, "grad_norm": 0.5282080769538879, "kl": 0.09214693959802389, "learning_rate": 4.288888888888889e-07, "loss": 0.0052, "num_tokens": 2754703.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9229 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 170.92592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.004612156189978123, "kl": 0.00015295148114091717, "learning_rate": 4.283333333333334e-07, "loss": 0.0, "num_tokens": 2754959.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 9230 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.5, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 18.5, "completions/mean_terminated_length": 18.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 170.94444444444446, "frac_reward_zero_std": 0.0, "grad_norm": 6.885163307189941, "kl": 0.19222988933324814, "learning_rate": 4.277777777777778e-07, "loss": 0.0115, "num_tokens": 2755273.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 9231 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 170.96296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.027531513944268227, "kl": 0.0022567480918951333, "learning_rate": 4.2722222222222227e-07, "loss": 0.0001, "num_tokens": 2755533.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 9232 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 43.5, "completions/clipped_ratio": 0.0, "completions/max_length": 47.0, "completions/max_terminated_length": 47.0, "completions/mean_length": 43.5, "completions/mean_terminated_length": 43.5, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "epoch": 170.9814814814815, "frac_reward_zero_std": 0.0, "grad_norm": 1.1337239742279053, "kl": 0.12007967568933964, "learning_rate": 4.266666666666667e-07, "loss": -0.007, "num_tokens": 2755987.0, "reward": 2.375, "reward_std": 1.25, "rewards/reward_combined/mean": 2.375, "rewards/reward_combined/std": 1.25, "step": 9233 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 171.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.10576338320970535, "kl": 0.0316583983367309, "learning_rate": 4.261111111111112e-07, "loss": 0.0017, "num_tokens": 2756285.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9234 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 171.0185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.007807786110788584, "kl": 0.008418784476816654, "learning_rate": 4.255555555555556e-07, "loss": 0.0004, "num_tokens": 2756597.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 9235 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 171.03703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.013529427349567413, "kl": 0.2520129233598709, "learning_rate": 4.2500000000000006e-07, "loss": 0.0126, "num_tokens": 2756895.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9236 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 171.05555555555554, "frac_reward_zero_std": 1.0, "grad_norm": 0.05471699312329292, "kl": 0.02231462486088276, "learning_rate": 4.2444444444444447e-07, "loss": 0.0012, "num_tokens": 2757183.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9237 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 171.07407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.12682951986789703, "kl": 0.009996035252697766, "learning_rate": 4.238888888888889e-07, "loss": 0.0005, "num_tokens": 2757481.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9238 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.25, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.25, "completions/mean_terminated_length": 13.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 171.09259259259258, "frac_reward_zero_std": 1.0, "grad_norm": 0.009396976791322231, "kl": 0.002153612906113267, "learning_rate": 4.233333333333334e-07, "loss": 0.0001, "num_tokens": 2757758.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9239 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 171.11111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.020909814164042473, "kl": 0.014016300905495882, "learning_rate": 4.227777777777778e-07, "loss": 0.0007, "num_tokens": 2758018.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9240 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.25, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.25, "completions/mean_terminated_length": 6.25, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 171.12962962962962, "frac_reward_zero_std": 1.0, "grad_norm": 0.03297625854611397, "kl": 0.0011068043240811676, "learning_rate": 4.2222222222222226e-07, "loss": 0.0001, "num_tokens": 2758251.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 9241 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 36.0, "completions/clipped_ratio": 0.0, "completions/max_length": 39.0, "completions/max_terminated_length": 39.0, "completions/mean_length": 36.0, "completions/mean_terminated_length": 36.0, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 171.14814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.05795174464583397, "kl": 0.07153275981545448, "learning_rate": 4.2166666666666667e-07, "loss": 0.0036, "num_tokens": 2758623.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9242 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 171.16666666666666, "frac_reward_zero_std": 1.0, "grad_norm": 0.05927801877260208, "kl": 0.04159258771687746, "learning_rate": 4.211111111111112e-07, "loss": 0.0022, "num_tokens": 2758895.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9243 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 18.0, "completions/mean_terminated_length": 18.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 171.1851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.008216489106416702, "kl": 0.001389084558468312, "learning_rate": 4.205555555555556e-07, "loss": 0.0001, "num_tokens": 2759207.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9244 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.75, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 17.75, "completions/mean_terminated_length": 17.75, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 171.2037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.058453865349292755, "kl": 0.15302303433418274, "learning_rate": 4.2000000000000006e-07, "loss": 0.0076, "num_tokens": 2759518.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9245 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 171.22222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.019427264109253883, "kl": 0.00416694994783029, "learning_rate": 4.1944444444444446e-07, "loss": 0.0002, "num_tokens": 2759806.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9246 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.0, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 22.0, "completions/mean_terminated_length": 22.0, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 171.24074074074073, "frac_reward_zero_std": 1.0, "grad_norm": 0.022236425429582596, "kl": 0.009421465452760458, "learning_rate": 4.18888888888889e-07, "loss": 0.0005, "num_tokens": 2760122.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9247 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.5, "completions/clipped_ratio": 0.0, "completions/max_length": 25.0, "completions/max_terminated_length": 25.0, "completions/mean_length": 21.5, "completions/mean_terminated_length": 21.5, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 171.25925925925927, "frac_reward_zero_std": 1.0, "grad_norm": 0.031701233237981796, "kl": 0.01179852127097547, "learning_rate": 4.183333333333334e-07, "loss": 0.0006, "num_tokens": 2760456.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 9248 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 50.5, "completions/clipped_ratio": 0.0, "completions/max_length": 55.0, "completions/max_terminated_length": 55.0, "completions/mean_length": 50.5, "completions/mean_terminated_length": 50.5, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 171.27777777777777, "frac_reward_zero_std": 0.0, "grad_norm": 1.713262677192688, "kl": 0.05352906323969364, "learning_rate": 4.177777777777778e-07, "loss": 0.0207, "num_tokens": 2760938.0, "reward": 2.375, "reward_std": 1.25, "rewards/reward_combined/mean": 2.375, "rewards/reward_combined/std": 1.25, "step": 9249 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 42.25, "completions/clipped_ratio": 0.0, "completions/max_length": 45.0, "completions/max_terminated_length": 45.0, "completions/mean_length": 42.25, "completions/mean_terminated_length": 42.25, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 171.2962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.0752776712179184, "kl": 0.08315620571374893, "learning_rate": 4.1722222222222226e-07, "loss": 0.0042, "num_tokens": 2761375.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9250 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 171.3148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.01593841053545475, "kl": 0.002375622047111392, "learning_rate": 4.1666666666666667e-07, "loss": 0.0001, "num_tokens": 2761631.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9251 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.5, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 18.5, "completions/mean_terminated_length": 18.5, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 171.33333333333334, "frac_reward_zero_std": 1.0, "grad_norm": 0.02261182852089405, "kl": 0.007661239709705114, "learning_rate": 4.161111111111112e-07, "loss": 0.0004, "num_tokens": 2761933.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9252 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 171.35185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.012981032021343708, "kl": 0.4379245340824127, "learning_rate": 4.155555555555556e-07, "loss": 0.0219, "num_tokens": 2762217.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9253 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.25, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 4.25, "completions/mean_terminated_length": 4.25, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 171.37037037037038, "frac_reward_zero_std": 1.0, "grad_norm": 0.019729923456907272, "kl": 0.0003616735339164734, "learning_rate": 4.1500000000000005e-07, "loss": 0.0, "num_tokens": 2762430.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 9254 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 171.38888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.006748690735548735, "kl": 0.0005747824907302856, "learning_rate": 4.1444444444444446e-07, "loss": 0.0, "num_tokens": 2762674.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 9255 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 46.75, "completions/clipped_ratio": 0.0, "completions/max_length": 51.0, "completions/max_terminated_length": 51.0, "completions/mean_length": 46.75, "completions/mean_terminated_length": 46.75, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 171.40740740740742, "frac_reward_zero_std": 1.0, "grad_norm": 0.03384356200695038, "kl": 0.023138395976275206, "learning_rate": 4.1388888888888897e-07, "loss": 0.0011, "num_tokens": 2763081.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9256 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 171.42592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.022757429629564285, "kl": 0.0007800966559443623, "learning_rate": 4.133333333333334e-07, "loss": 0.0, "num_tokens": 2763337.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 9257 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 27.75, "completions/clipped_ratio": 0.0, "completions/max_length": 28.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 27.75, "completions/mean_terminated_length": 27.75, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 171.44444444444446, "frac_reward_zero_std": 0.0, "grad_norm": 2.422699451446533, "kl": 0.11455383524298668, "learning_rate": 4.127777777777778e-07, "loss": -0.0018, "num_tokens": 2763684.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 9258 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.25, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.25, "completions/mean_terminated_length": 13.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 171.46296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.08427798748016357, "kl": 0.004451051587238908, "learning_rate": 4.1222222222222225e-07, "loss": 0.0002, "num_tokens": 2763949.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9259 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.0, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 20.0, "completions/mean_terminated_length": 20.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 171.4814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.19470073282718658, "kl": 0.0673128142952919, "learning_rate": 4.1166666666666666e-07, "loss": 0.0033, "num_tokens": 2764249.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 9260 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 5.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 5.0, "completions/mean_terminated_length": 5.0, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 171.5, "frac_reward_zero_std": 1.0, "grad_norm": 0.004520250950008631, "kl": 0.0006215214671101421, "learning_rate": 4.111111111111112e-07, "loss": 0.0, "num_tokens": 2764469.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 9261 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.5, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 22.5, "completions/mean_terminated_length": 22.5, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 171.5185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.043426431715488434, "kl": 0.006902510765939951, "learning_rate": 4.105555555555556e-07, "loss": 0.0003, "num_tokens": 2764803.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9262 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 171.53703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.29461541771888733, "kl": 0.015286177396774292, "learning_rate": 4.1000000000000004e-07, "loss": 0.0011, "num_tokens": 2765015.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 9263 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 35.5, "completions/clipped_ratio": 0.0, "completions/max_length": 48.0, "completions/max_terminated_length": 48.0, "completions/mean_length": 35.5, "completions/mean_terminated_length": 35.5, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 171.55555555555554, "frac_reward_zero_std": 1.0, "grad_norm": 0.8931791186332703, "kl": 0.13321826234459877, "learning_rate": 4.0944444444444445e-07, "loss": 0.0075, "num_tokens": 2765381.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9264 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.0, "completions/clipped_ratio": 0.0, "completions/max_length": 35.0, "completions/max_terminated_length": 35.0, "completions/mean_length": 23.0, "completions/mean_terminated_length": 23.0, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 171.57407407407408, "frac_reward_zero_std": 0.0, "grad_norm": 2.9833085536956787, "kl": 0.4814579263329506, "learning_rate": 4.0888888888888897e-07, "loss": -0.3232, "num_tokens": 2765725.0, "reward": 6.125, "reward_std": 3.4247870445251465, "rewards/reward_combined/mean": 6.125, "rewards/reward_combined/std": 3.4247870445251465, "step": 9265 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.75, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 6.75, "completions/mean_terminated_length": 6.75, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 171.59259259259258, "frac_reward_zero_std": 1.0, "grad_norm": 0.12609098851680756, "kl": 0.02914639189839363, "learning_rate": 4.083333333333334e-07, "loss": 0.0015, "num_tokens": 2765952.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 9266 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 171.61111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.01647152751684189, "kl": 0.012256576213985682, "learning_rate": 4.0777777777777784e-07, "loss": 0.0006, "num_tokens": 2766238.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9267 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 171.62962962962962, "frac_reward_zero_std": 1.0, "grad_norm": 0.11514153331518173, "kl": 0.041497090831398964, "learning_rate": 4.0722222222222225e-07, "loss": 0.0022, "num_tokens": 2766510.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9268 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 171.64814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.026926137506961823, "kl": 0.005351900064852089, "learning_rate": 4.0666666666666666e-07, "loss": 0.0003, "num_tokens": 2766814.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9269 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.5, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 9.5, "completions/mean_terminated_length": 9.5, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 171.66666666666666, "frac_reward_zero_std": 1.0, "grad_norm": 0.02806163765490055, "kl": 0.004474507179111242, "learning_rate": 4.0611111111111117e-07, "loss": 0.0002, "num_tokens": 2767076.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9270 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.75, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.75, "completions/mean_terminated_length": 15.75, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 171.6851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.0288766510784626, "kl": 0.0022042092168703675, "learning_rate": 4.055555555555556e-07, "loss": 0.0001, "num_tokens": 2767399.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9271 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 171.7037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.0023210991639643908, "kl": 0.28223420679569244, "learning_rate": 4.0500000000000004e-07, "loss": 0.0141, "num_tokens": 2767687.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9272 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 171.72222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.016210738569498062, "kl": 0.0027348033618181944, "learning_rate": 4.0444444444444445e-07, "loss": 0.0001, "num_tokens": 2767971.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9273 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 171.74074074074073, "frac_reward_zero_std": 1.0, "grad_norm": 0.0055869752541184425, "kl": 0.003232792019844055, "learning_rate": 4.0388888888888896e-07, "loss": 0.0002, "num_tokens": 2768231.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 9274 }, { "clip_ratio/high_max": 0.008928571827709675, "clip_ratio/high_mean": 0.008928571827709675, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.008928571827709675, "completion_length": 55.0, "completions/clipped_ratio": 0.0, "completions/max_length": 144.0, "completions/max_terminated_length": 144.0, "completions/mean_length": 55.0, "completions/mean_terminated_length": 55.0, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 171.75925925925927, "frac_reward_zero_std": 0.0, "grad_norm": 1.9915540218353271, "kl": 0.12986332550644875, "learning_rate": 4.0333333333333337e-07, "loss": 0.3857, "num_tokens": 2768667.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 9275 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.25, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 14.25, "completions/mean_terminated_length": 14.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 171.77777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.22334958612918854, "kl": 0.05932823475450277, "learning_rate": 4.0277777777777783e-07, "loss": 0.0032, "num_tokens": 2768948.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9276 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 18.0, "completions/mean_terminated_length": 18.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 171.7962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.06022645905613899, "kl": 0.025689803063869476, "learning_rate": 4.0222222222222224e-07, "loss": 0.0013, "num_tokens": 2769240.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9277 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 171.8148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.08687689900398254, "kl": 0.014630092307925224, "learning_rate": 4.0166666666666676e-07, "loss": 0.0008, "num_tokens": 2769512.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9278 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.25, "completions/clipped_ratio": 0.0, "completions/max_length": 25.0, "completions/max_terminated_length": 25.0, "completions/mean_length": 18.25, "completions/mean_terminated_length": 18.25, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 171.83333333333334, "frac_reward_zero_std": 1.0, "grad_norm": 0.7389340996742249, "kl": 0.2611394003033638, "learning_rate": 4.0111111111111116e-07, "loss": 0.0141, "num_tokens": 2769805.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9279 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 171.85185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.04972091689705849, "kl": 0.012495948700234294, "learning_rate": 4.0055555555555557e-07, "loss": 0.0006, "num_tokens": 2770077.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9280 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.5, "completions/clipped_ratio": 0.0, "completions/max_length": 33.0, "completions/max_terminated_length": 33.0, "completions/mean_length": 24.5, "completions/mean_terminated_length": 24.5, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 171.87037037037038, "frac_reward_zero_std": 0.0, "grad_norm": 3.1804299354553223, "kl": 0.11634692549705505, "learning_rate": 4.0000000000000003e-07, "loss": 0.1156, "num_tokens": 2770403.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 9281 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.75, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 19.75, "completions/mean_terminated_length": 19.75, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 171.88888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.041615042835474014, "kl": 0.012189901899546385, "learning_rate": 3.9944444444444444e-07, "loss": 0.0006, "num_tokens": 2770734.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9282 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 171.90740740740742, "frac_reward_zero_std": 1.0, "grad_norm": 0.011744311079382896, "kl": 0.008232608437538147, "learning_rate": 3.9888888888888896e-07, "loss": 0.0004, "num_tokens": 2770970.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 9283 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 171.92592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.007641908247023821, "kl": 0.0016687363386154175, "learning_rate": 3.9833333333333337e-07, "loss": 0.0001, "num_tokens": 2771182.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 9284 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 33.75, "completions/clipped_ratio": 0.0, "completions/max_length": 37.0, "completions/max_terminated_length": 37.0, "completions/mean_length": 33.75, "completions/mean_terminated_length": 33.75, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 171.94444444444446, "frac_reward_zero_std": 1.0, "grad_norm": 0.2311616986989975, "kl": 0.13791117817163467, "learning_rate": 3.9777777777777783e-07, "loss": 0.0071, "num_tokens": 2771541.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9285 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 171.96296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 4.680829442804679e-05, "kl": 4.902482032775879e-06, "learning_rate": 3.9722222222222224e-07, "loss": 0.0, "num_tokens": 2771761.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 9286 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.75, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 21.75, "completions/mean_terminated_length": 21.75, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 171.9814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.06455623358488083, "kl": 0.009478762280195951, "learning_rate": 3.9666666666666675e-07, "loss": 0.0005, "num_tokens": 2772096.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9287 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 10.5, "completions/mean_terminated_length": 10.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 172.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.06341173499822617, "kl": 0.013934919610619545, "learning_rate": 3.9611111111111116e-07, "loss": 0.0007, "num_tokens": 2772358.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 9288 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 172.0185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.02178741805255413, "kl": 0.0035754453856498003, "learning_rate": 3.9555555555555557e-07, "loss": 0.0002, "num_tokens": 2772656.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9289 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 172.03703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.008802296593785286, "kl": 0.0037445584312081337, "learning_rate": 3.9500000000000003e-07, "loss": 0.0002, "num_tokens": 2772960.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9290 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.75, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 13.75, "completions/mean_terminated_length": 13.75, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 172.05555555555554, "frac_reward_zero_std": 1.0, "grad_norm": 0.13402767479419708, "kl": 0.014036783948540688, "learning_rate": 3.9444444444444444e-07, "loss": 0.0008, "num_tokens": 2773231.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9291 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 172.07407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 4.79798654851038e-05, "kl": 5.0067901611328125e-06, "learning_rate": 3.9388888888888895e-07, "loss": 0.0, "num_tokens": 2773451.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 9292 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.5, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 22.5, "completions/mean_terminated_length": 22.5, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 172.09259259259258, "frac_reward_zero_std": 1.0, "grad_norm": 0.021560505032539368, "kl": 0.007062884280458093, "learning_rate": 3.9333333333333336e-07, "loss": 0.0004, "num_tokens": 2773769.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9293 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.25, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 18.25, "completions/mean_terminated_length": 18.25, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 172.11111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.03935709223151207, "kl": 0.0036150331143289804, "learning_rate": 3.927777777777778e-07, "loss": 0.0002, "num_tokens": 2774082.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9294 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.25, "completions/clipped_ratio": 0.0, "completions/max_length": 28.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 23.25, "completions/mean_terminated_length": 23.25, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 172.12962962962962, "frac_reward_zero_std": 1.0, "grad_norm": 0.07741757482290268, "kl": 0.020517602562904358, "learning_rate": 3.9222222222222223e-07, "loss": 0.001, "num_tokens": 2774423.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 9295 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 172.14814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.016936616972088814, "kl": 0.006107708089984953, "learning_rate": 3.9166666666666675e-07, "loss": 0.0003, "num_tokens": 2774715.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9296 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 30.5, "completions/clipped_ratio": 0.0, "completions/max_length": 41.0, "completions/max_terminated_length": 41.0, "completions/mean_length": 30.5, "completions/mean_terminated_length": 30.5, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 172.16666666666666, "frac_reward_zero_std": 0.0, "grad_norm": 1.7078194618225098, "kl": 0.1179865151643753, "learning_rate": 3.9111111111111115e-07, "loss": 0.005, "num_tokens": 2775089.0, "reward": 7.75, "reward_std": 0.28867512941360474, "rewards/reward_combined/mean": 7.75, "rewards/reward_combined/std": 0.28867512941360474, "step": 9297 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.0, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 17.0, "completions/mean_terminated_length": 17.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 172.1851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.3263930082321167, "kl": 0.02732260152697563, "learning_rate": 3.905555555555556e-07, "loss": 0.0016, "num_tokens": 2775377.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9298 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 172.2037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.021657992154359818, "kl": 0.013733734842389822, "learning_rate": 3.9e-07, "loss": 0.0007, "num_tokens": 2775637.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9299 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 27.75, "completions/clipped_ratio": 0.0, "completions/max_length": 28.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 27.75, "completions/mean_terminated_length": 27.75, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 172.22222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.07763224095106125, "kl": 0.11507385224103928, "learning_rate": 3.8944444444444443e-07, "loss": 0.0058, "num_tokens": 2775972.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9300 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 172.24074074074073, "frac_reward_zero_std": 1.0, "grad_norm": 0.023219669237732887, "kl": 0.010506406426429749, "learning_rate": 3.8888888888888895e-07, "loss": 0.0005, "num_tokens": 2776284.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 9301 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 172.25925925925927, "frac_reward_zero_std": 1.0, "grad_norm": 0.0170900896191597, "kl": 0.00035499632213031873, "learning_rate": 3.8833333333333336e-07, "loss": 0.0, "num_tokens": 2776540.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 9302 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.5, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 6.5, "completions/mean_terminated_length": 6.5, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 172.27777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.025064218789339066, "kl": 0.024807901121675968, "learning_rate": 3.877777777777778e-07, "loss": 0.0012, "num_tokens": 2776766.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 9303 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.25, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 17.25, "completions/mean_terminated_length": 17.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 172.2962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.01141685713082552, "kl": 0.17285750061273575, "learning_rate": 3.872222222222222e-07, "loss": 0.0086, "num_tokens": 2777075.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9304 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.25, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 16.25, "completions/mean_terminated_length": 16.25, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 172.3148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.05613263323903084, "kl": 0.2802729308605194, "learning_rate": 3.8666666666666674e-07, "loss": 0.014, "num_tokens": 2777364.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9305 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 18.0, "completions/mean_terminated_length": 18.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 172.33333333333334, "frac_reward_zero_std": 0.0, "grad_norm": 3.4195351600646973, "kl": 0.1858106330037117, "learning_rate": 3.8611111111111115e-07, "loss": 0.0797, "num_tokens": 2777656.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 9306 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 172.35185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.02490217797458172, "kl": 0.0004833415150642395, "learning_rate": 3.855555555555556e-07, "loss": 0.0, "num_tokens": 2777868.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 9307 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.0, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 22.0, "completions/mean_terminated_length": 22.0, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 172.37037037037038, "frac_reward_zero_std": 1.0, "grad_norm": 0.08979335427284241, "kl": 0.01594660885166377, "learning_rate": 3.85e-07, "loss": 0.0008, "num_tokens": 2778200.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9308 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 172.38888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.02090134285390377, "kl": 0.0064416201785206795, "learning_rate": 3.8444444444444453e-07, "loss": 0.0003, "num_tokens": 2778472.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9309 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 172.40740740740742, "frac_reward_zero_std": 1.0, "grad_norm": 0.007593576796352863, "kl": 0.0016336292028427124, "learning_rate": 3.8388888888888894e-07, "loss": 0.0001, "num_tokens": 2778684.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 9310 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 172.42592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.0028444717172533274, "kl": 0.001354836334940046, "learning_rate": 3.8333333333333335e-07, "loss": 0.0001, "num_tokens": 2779002.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9311 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 5.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 5.0, "completions/mean_terminated_length": 5.0, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 172.44444444444446, "frac_reward_zero_std": 1.0, "grad_norm": 0.0044402750208973885, "kl": 0.0005992591031827033, "learning_rate": 3.827777777777778e-07, "loss": 0.0, "num_tokens": 2779222.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 9312 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 172.46296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.022290678694844246, "kl": 0.4309501051902771, "learning_rate": 3.822222222222222e-07, "loss": 0.0215, "num_tokens": 2779506.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9313 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 46.0, "completions/clipped_ratio": 0.0, "completions/max_length": 51.0, "completions/max_terminated_length": 51.0, "completions/mean_length": 46.0, "completions/mean_terminated_length": 46.0, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "epoch": 172.4814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.045792512595653534, "kl": 0.04944424144923687, "learning_rate": 3.8166666666666674e-07, "loss": 0.0024, "num_tokens": 2779970.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 9314 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 172.5, "frac_reward_zero_std": 1.0, "grad_norm": 0.018603837117552757, "kl": 0.0022268511820584536, "learning_rate": 3.8111111111111114e-07, "loss": 0.0001, "num_tokens": 2780226.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9315 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.5, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 12.5, "completions/mean_terminated_length": 12.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 172.5185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.06011150777339935, "kl": 0.003202415187843144, "learning_rate": 3.805555555555556e-07, "loss": 0.0002, "num_tokens": 2780488.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9316 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 172.53703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.00758792320266366, "kl": 0.004096812102943659, "learning_rate": 3.8e-07, "loss": 0.0002, "num_tokens": 2780752.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9317 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 172.55555555555554, "frac_reward_zero_std": 1.0, "grad_norm": 0.014081151224672794, "kl": 0.003179658204317093, "learning_rate": 3.7944444444444453e-07, "loss": 0.0002, "num_tokens": 2781012.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 9318 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.0, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 19.0, "completions/mean_terminated_length": 19.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 172.57407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.06821320950984955, "kl": 0.016539629083126783, "learning_rate": 3.7888888888888894e-07, "loss": 0.0008, "num_tokens": 2781340.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9319 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 172.59259259259258, "frac_reward_zero_std": 1.0, "grad_norm": 0.009479859843850136, "kl": 0.002620329149067402, "learning_rate": 3.7833333333333335e-07, "loss": 0.0001, "num_tokens": 2781620.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9320 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 35.0, "completions/clipped_ratio": 0.0, "completions/max_length": 36.0, "completions/max_terminated_length": 36.0, "completions/mean_length": 35.0, "completions/mean_terminated_length": 35.0, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 172.61111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.01924264244735241, "kl": 0.0691790021955967, "learning_rate": 3.777777777777778e-07, "loss": 0.0035, "num_tokens": 2781988.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9321 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.0, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 20.0, "completions/mean_terminated_length": 20.0, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 172.62962962962962, "frac_reward_zero_std": 1.0, "grad_norm": 0.038289424031972885, "kl": 0.24325392954051495, "learning_rate": 3.772222222222222e-07, "loss": 0.0122, "num_tokens": 2782316.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9322 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 29.0, "completions/clipped_ratio": 0.0, "completions/max_length": 30.0, "completions/max_terminated_length": 30.0, "completions/mean_length": 29.0, "completions/mean_terminated_length": 29.0, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 172.64814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.052594758570194244, "kl": 0.1251644492149353, "learning_rate": 3.7666666666666673e-07, "loss": 0.0062, "num_tokens": 2782648.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9323 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 172.66666666666666, "frac_reward_zero_std": 1.0, "grad_norm": 0.0009429831989109516, "kl": 0.001698523759841919, "learning_rate": 3.7611111111111114e-07, "loss": 0.0001, "num_tokens": 2782928.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9324 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.75, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 9.75, "completions/mean_terminated_length": 9.75, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 172.6851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.1377951055765152, "kl": 0.01817973703145981, "learning_rate": 3.755555555555556e-07, "loss": 0.0011, "num_tokens": 2783179.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 9325 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 172.7037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.3073562979698181, "kl": 0.04190004616975784, "learning_rate": 3.75e-07, "loss": 0.0021, "num_tokens": 2783467.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9326 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.0, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 19.0, "completions/mean_terminated_length": 19.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 172.72222222222223, "frac_reward_zero_std": 0.0, "grad_norm": 3.6353869438171387, "kl": 0.09600532054901123, "learning_rate": 3.744444444444445e-07, "loss": -0.0215, "num_tokens": 2783763.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 9327 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 172.74074074074073, "frac_reward_zero_std": 1.0, "grad_norm": 0.09292547404766083, "kl": 0.037654574029147625, "learning_rate": 3.7388888888888893e-07, "loss": 0.0021, "num_tokens": 2784035.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9328 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 172.75925925925927, "frac_reward_zero_std": 1.0, "grad_norm": 0.025060992687940598, "kl": 0.023356677033007145, "learning_rate": 3.733333333333334e-07, "loss": 0.0012, "num_tokens": 2784325.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9329 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 38.5, "completions/clipped_ratio": 0.0, "completions/max_length": 53.0, "completions/max_terminated_length": 53.0, "completions/mean_length": 38.5, "completions/mean_terminated_length": 38.5, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 172.77777777777777, "frac_reward_zero_std": 0.0, "grad_norm": 2.1117775440216064, "kl": 0.14538377802819014, "learning_rate": 3.727777777777778e-07, "loss": -0.0348, "num_tokens": 2784699.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 9330 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 172.7962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.008886339142918587, "kl": 0.23915622383356094, "learning_rate": 3.722222222222222e-07, "loss": 0.0119, "num_tokens": 2784999.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9331 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 172.8148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.07952993363142014, "kl": 0.0076755882473662496, "learning_rate": 3.716666666666667e-07, "loss": 0.0004, "num_tokens": 2785239.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 9332 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.5, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 18.5, "completions/mean_terminated_length": 18.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 172.83333333333334, "frac_reward_zero_std": 1.0, "grad_norm": 0.11312278360128403, "kl": 0.037068165838718414, "learning_rate": 3.7111111111111113e-07, "loss": 0.0018, "num_tokens": 2785533.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9333 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 39.25, "completions/clipped_ratio": 0.0, "completions/max_length": 43.0, "completions/max_terminated_length": 43.0, "completions/mean_length": 39.25, "completions/mean_terminated_length": 39.25, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 172.85185185185185, "frac_reward_zero_std": 0.0, "grad_norm": 1.7949790954589844, "kl": 0.19212953373789787, "learning_rate": 3.705555555555556e-07, "loss": -0.0101, "num_tokens": 2785914.0, "reward": 4.75, "reward_std": 5.5, "rewards/reward_combined/mean": 4.75, "rewards/reward_combined/std": 5.5, "step": 9334 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 44.25, "completions/clipped_ratio": 0.0, "completions/max_length": 45.0, "completions/max_terminated_length": 45.0, "completions/mean_length": 44.25, "completions/mean_terminated_length": 44.25, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "epoch": 172.87037037037038, "frac_reward_zero_std": 0.0, "grad_norm": 0.8705803155899048, "kl": 0.08215533196926117, "learning_rate": 3.7e-07, "loss": 0.004, "num_tokens": 2786359.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 9335 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 172.88888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.011643679812550545, "kl": 0.008234910666942596, "learning_rate": 3.694444444444445e-07, "loss": 0.0004, "num_tokens": 2786595.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 9336 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.5, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 8.5, "completions/mean_terminated_length": 8.5, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 172.90740740740742, "frac_reward_zero_std": 0.0, "grad_norm": 6.486974239349365, "kl": 0.02966504730284214, "learning_rate": 3.6888888888888893e-07, "loss": 0.3481, "num_tokens": 2786825.0, "reward": 2.75, "reward_std": 1.5, "rewards/reward_combined/mean": 2.75, "rewards/reward_combined/std": 1.5, "step": 9337 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 18.0, "completions/mean_terminated_length": 18.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 172.92592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.1584530770778656, "kl": 0.09604036435484886, "learning_rate": 3.683333333333334e-07, "loss": 0.0048, "num_tokens": 2787125.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9338 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 172.94444444444446, "frac_reward_zero_std": 1.0, "grad_norm": 0.06752997636795044, "kl": 0.008278721012175083, "learning_rate": 3.677777777777778e-07, "loss": 0.0004, "num_tokens": 2787391.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 9339 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 27.5, "completions/clipped_ratio": 0.0, "completions/max_length": 31.0, "completions/max_terminated_length": 31.0, "completions/mean_length": 27.5, "completions/mean_terminated_length": 27.5, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 172.96296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.43636441230773926, "kl": 0.2039307877421379, "learning_rate": 3.6722222222222226e-07, "loss": 0.0102, "num_tokens": 2787737.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9340 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 18.0, "completions/mean_terminated_length": 18.0, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 172.9814814814815, "frac_reward_zero_std": 0.0, "grad_norm": 2.560497522354126, "kl": 0.017366907559335232, "learning_rate": 3.666666666666667e-07, "loss": -0.0233, "num_tokens": 2788037.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 9341 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 173.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.037638626992702484, "kl": 0.012421516701579094, "learning_rate": 3.6611111111111113e-07, "loss": 0.0006, "num_tokens": 2788313.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9342 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.25, "completions/clipped_ratio": 0.0, "completions/max_length": 27.0, "completions/max_terminated_length": 27.0, "completions/mean_length": 24.25, "completions/mean_terminated_length": 24.25, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 173.0185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.03006104938685894, "kl": 0.01363749336451292, "learning_rate": 3.655555555555556e-07, "loss": 0.0007, "num_tokens": 2788658.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 9343 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 173.03703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.03084772825241089, "kl": 0.01855398342013359, "learning_rate": 3.65e-07, "loss": 0.0009, "num_tokens": 2788926.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9344 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 173.05555555555554, "frac_reward_zero_std": 1.0, "grad_norm": 0.0235455222427845, "kl": 0.010349012911319733, "learning_rate": 3.644444444444445e-07, "loss": 0.0005, "num_tokens": 2789238.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 9345 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.5, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 18.5, "completions/mean_terminated_length": 18.5, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 173.07407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.0176225733011961, "kl": 0.0013293921947479248, "learning_rate": 3.638888888888889e-07, "loss": 0.0001, "num_tokens": 2789552.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9346 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 173.09259259259258, "frac_reward_zero_std": 1.0, "grad_norm": 0.02964184060692787, "kl": 0.0028348092455416918, "learning_rate": 3.633333333333334e-07, "loss": 0.0001, "num_tokens": 2789808.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9347 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 173.11111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.05550900474190712, "kl": 0.0101740681566298, "learning_rate": 3.627777777777778e-07, "loss": 0.0005, "num_tokens": 2790098.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9348 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 47.0, "completions/clipped_ratio": 0.0, "completions/max_length": 51.0, "completions/max_terminated_length": 51.0, "completions/mean_length": 47.0, "completions/mean_terminated_length": 47.0, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 173.12962962962962, "frac_reward_zero_std": 1.0, "grad_norm": 0.02350495755672455, "kl": 0.04414127394556999, "learning_rate": 3.622222222222223e-07, "loss": 0.0022, "num_tokens": 2790566.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 9349 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 173.14814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.01052885316312313, "kl": 0.0032545290887355804, "learning_rate": 3.616666666666667e-07, "loss": 0.0002, "num_tokens": 2790826.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 9350 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 31.5, "completions/clipped_ratio": 0.0, "completions/max_length": 49.0, "completions/max_terminated_length": 49.0, "completions/mean_length": 31.5, "completions/mean_terminated_length": 31.5, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 173.16666666666666, "frac_reward_zero_std": 1.0, "grad_norm": 0.03480667620897293, "kl": 0.0226528849452734, "learning_rate": 3.611111111111111e-07, "loss": 0.0012, "num_tokens": 2791172.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9351 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.75, "completions/clipped_ratio": 0.0, "completions/max_length": 25.0, "completions/max_terminated_length": 25.0, "completions/mean_length": 17.75, "completions/mean_terminated_length": 17.75, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 173.1851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.0426400862634182, "kl": 0.006377564277499914, "learning_rate": 3.605555555555556e-07, "loss": 0.0004, "num_tokens": 2791503.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9352 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 173.2037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.03814734145998955, "kl": 0.002475414308719337, "learning_rate": 3.6e-07, "loss": 0.0001, "num_tokens": 2791771.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9353 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 173.22222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.10539256781339645, "kl": 0.01669210009276867, "learning_rate": 3.594444444444445e-07, "loss": 0.0009, "num_tokens": 2792055.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9354 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.5, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 17.5, "completions/mean_terminated_length": 17.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 173.24074074074073, "frac_reward_zero_std": 1.0, "grad_norm": 0.050645679235458374, "kl": 0.015026289038360119, "learning_rate": 3.588888888888889e-07, "loss": 0.0008, "num_tokens": 2792353.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9355 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 28.0, "completions/clipped_ratio": 0.0, "completions/max_length": 32.0, "completions/max_terminated_length": 32.0, "completions/mean_length": 28.0, "completions/mean_terminated_length": 28.0, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 173.25925925925927, "frac_reward_zero_std": 1.0, "grad_norm": 0.3748486638069153, "kl": 0.11952191963791847, "learning_rate": 3.583333333333334e-07, "loss": 0.0061, "num_tokens": 2792701.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9356 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 173.27777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.007651558145880699, "kl": 0.0016255378723144531, "learning_rate": 3.577777777777778e-07, "loss": 0.0001, "num_tokens": 2792913.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 9357 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 173.2962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 5.832100578118116e-05, "kl": 5.260109901428223e-06, "learning_rate": 3.572222222222223e-07, "loss": 0.0, "num_tokens": 2793133.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 9358 }, { "clip_ratio/high_max": 0.011904762126505375, "clip_ratio/high_mean": 0.011904762126505375, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.011904762126505375, "completion_length": 17.0, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 17.0, "completions/mean_terminated_length": 17.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 173.3148148148148, "frac_reward_zero_std": 0.0, "grad_norm": 4.833471298217773, "kl": 0.1954428267199546, "learning_rate": 3.566666666666667e-07, "loss": 0.1257, "num_tokens": 2793433.0, "reward": 7.25, "reward_std": 1.5, "rewards/reward_combined/mean": 7.25, "rewards/reward_combined/std": 1.5, "step": 9359 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.5, "completions/clipped_ratio": 0.0, "completions/max_length": 25.0, "completions/max_terminated_length": 25.0, "completions/mean_length": 23.5, "completions/mean_terminated_length": 23.5, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 173.33333333333334, "frac_reward_zero_std": 1.0, "grad_norm": 0.07354491204023361, "kl": 0.026724157854914665, "learning_rate": 3.5611111111111117e-07, "loss": 0.0014, "num_tokens": 2793775.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9360 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 173.35185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.014892864041030407, "kl": 0.4379664659500122, "learning_rate": 3.555555555555556e-07, "loss": 0.0219, "num_tokens": 2794059.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9361 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.75, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.75, "completions/mean_terminated_length": 6.75, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 173.37037037037038, "frac_reward_zero_std": 1.0, "grad_norm": 0.017869343981146812, "kl": 0.001170838891994208, "learning_rate": 3.55e-07, "loss": 0.0001, "num_tokens": 2794294.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 9362 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 75.0, "completions/clipped_ratio": 0.25, "completions/max_length": 256.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 75.0, "completions/mean_terminated_length": 14.666666984558105, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 173.38888888888889, "frac_reward_zero_std": 0.0, "grad_norm": 1.2607446908950806, "kl": 0.19457008689641953, "learning_rate": 3.544444444444445e-07, "loss": 0.4534, "num_tokens": 2794834.0, "reward": 6.050000190734863, "reward_std": 3.9000003337860107, "rewards/reward_combined/mean": 6.050000190734863, "rewards/reward_combined/std": 3.9000000953674316, "step": 9363 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 28.5, "completions/clipped_ratio": 0.0, "completions/max_length": 31.0, "completions/max_terminated_length": 31.0, "completions/mean_length": 28.5, "completions/mean_terminated_length": 28.5, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 173.40740740740742, "frac_reward_zero_std": 1.0, "grad_norm": 0.28941234946250916, "kl": 0.04916580580174923, "learning_rate": 3.538888888888889e-07, "loss": 0.0025, "num_tokens": 2795176.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9364 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.5, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 2.5, "completions/mean_terminated_length": 2.5, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 173.42592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.011151889339089394, "kl": 0.0021769776940345764, "learning_rate": 3.533333333333334e-07, "loss": 0.0001, "num_tokens": 2795382.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 9365 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 5.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 5.0, "completions/mean_terminated_length": 5.0, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 173.44444444444446, "frac_reward_zero_std": 1.0, "grad_norm": 0.004458112176507711, "kl": 0.0006399989069905132, "learning_rate": 3.527777777777778e-07, "loss": 0.0, "num_tokens": 2795602.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 9366 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.5, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 20.5, "completions/mean_terminated_length": 20.5, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 173.46296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.02884458750486374, "kl": 0.008445583982393146, "learning_rate": 3.522222222222223e-07, "loss": 0.0004, "num_tokens": 2795936.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9367 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 16.5, "completions/mean_terminated_length": 16.5, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 173.4814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.02732694521546364, "kl": 0.0034783007577061653, "learning_rate": 3.516666666666667e-07, "loss": 0.0002, "num_tokens": 2796242.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9368 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 173.5, "frac_reward_zero_std": 1.0, "grad_norm": 0.011344692669808865, "kl": 0.00036744773387908936, "learning_rate": 3.5111111111111117e-07, "loss": 0.0, "num_tokens": 2796454.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 9369 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 173.5185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.02304164133965969, "kl": 0.0047009720001369715, "learning_rate": 3.505555555555556e-07, "loss": 0.0002, "num_tokens": 2796714.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9370 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 38.0, "completions/clipped_ratio": 0.0, "completions/max_length": 48.0, "completions/max_terminated_length": 48.0, "completions/mean_length": 38.0, "completions/mean_terminated_length": 38.0, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 173.53703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.4819320738315582, "kl": 0.15312842279672623, "learning_rate": 3.5000000000000004e-07, "loss": 0.0079, "num_tokens": 2797118.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9371 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 173.55555555555554, "frac_reward_zero_std": 1.0, "grad_norm": 0.7668307423591614, "kl": 0.08765116333961487, "learning_rate": 3.494444444444445e-07, "loss": 0.0044, "num_tokens": 2797334.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 9372 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 173.57407407407408, "frac_reward_zero_std": 0.0, "grad_norm": 0.5543175935745239, "kl": 0.5499984044581652, "learning_rate": 3.488888888888889e-07, "loss": 0.0104, "num_tokens": 2797628.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 9373 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.5, "completions/clipped_ratio": 0.0, "completions/max_length": 31.0, "completions/max_terminated_length": 31.0, "completions/mean_length": 24.5, "completions/mean_terminated_length": 24.5, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 173.59259259259258, "frac_reward_zero_std": 1.0, "grad_norm": 0.287566214799881, "kl": 0.061633121222257614, "learning_rate": 3.4833333333333337e-07, "loss": 0.0032, "num_tokens": 2797946.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 9374 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.5, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 8.5, "completions/mean_terminated_length": 8.5, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 173.61111111111111, "frac_reward_zero_std": 0.0, "grad_norm": 5.613457202911377, "kl": 0.08656823635101318, "learning_rate": 3.477777777777778e-07, "loss": 0.0621, "num_tokens": 2798192.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 9375 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.5, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 8.5, "completions/mean_terminated_length": 8.5, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 173.62962962962962, "frac_reward_zero_std": 1.0, "grad_norm": 0.021501949056982994, "kl": 0.0011914208880625665, "learning_rate": 3.472222222222223e-07, "loss": 0.0001, "num_tokens": 2798446.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 9376 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 173.64814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.07699888199567795, "kl": 0.0601310208439827, "learning_rate": 3.466666666666667e-07, "loss": 0.0031, "num_tokens": 2798736.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9377 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 31.0, "completions/clipped_ratio": 0.0, "completions/max_length": 34.0, "completions/max_terminated_length": 34.0, "completions/mean_length": 31.0, "completions/mean_terminated_length": 31.0, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 173.66666666666666, "frac_reward_zero_std": 0.0, "grad_norm": 2.8836774826049805, "kl": 0.10593215376138687, "learning_rate": 3.4611111111111116e-07, "loss": 0.0455, "num_tokens": 2799084.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 9378 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.0, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 17.0, "completions/mean_terminated_length": 17.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 173.6851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.12636886537075043, "kl": 0.15146322548389435, "learning_rate": 3.4555555555555557e-07, "loss": 0.0076, "num_tokens": 2799392.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9379 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 173.7037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.011707697063684464, "kl": 0.008209742605686188, "learning_rate": 3.4500000000000003e-07, "loss": 0.0004, "num_tokens": 2799628.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 9380 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.25, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 17.25, "completions/mean_terminated_length": 17.25, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 173.72222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.0872398242354393, "kl": 0.02828060369938612, "learning_rate": 3.444444444444445e-07, "loss": 0.0014, "num_tokens": 2799917.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9381 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 173.74074074074073, "frac_reward_zero_std": 1.0, "grad_norm": 0.021690569818019867, "kl": 0.013805336318910122, "learning_rate": 3.438888888888889e-07, "loss": 0.0007, "num_tokens": 2800177.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9382 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.25, "completions/clipped_ratio": 0.0, "completions/max_length": 25.0, "completions/max_terminated_length": 25.0, "completions/mean_length": 23.25, "completions/mean_terminated_length": 23.25, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 173.75925925925927, "frac_reward_zero_std": 1.0, "grad_norm": 0.03020733967423439, "kl": 0.011163719464093447, "learning_rate": 3.4333333333333336e-07, "loss": 0.0006, "num_tokens": 2800498.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9383 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 173.77777777777777, "frac_reward_zero_std": 0.0, "grad_norm": 4.0380072593688965, "kl": 0.614412693772465, "learning_rate": 3.4277777777777777e-07, "loss": 0.0734, "num_tokens": 2800796.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 9384 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.0, "completions/clipped_ratio": 0.0, "completions/max_length": 28.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 19.0, "completions/mean_terminated_length": 19.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 173.7962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.03920014947652817, "kl": 0.254609078168869, "learning_rate": 3.422222222222223e-07, "loss": 0.0124, "num_tokens": 2801096.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9385 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 35.75, "completions/clipped_ratio": 0.0, "completions/max_length": 45.0, "completions/max_terminated_length": 45.0, "completions/mean_length": 35.75, "completions/mean_terminated_length": 35.75, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 173.8148148148148, "frac_reward_zero_std": 0.0, "grad_norm": 1.3304016590118408, "kl": 0.1523362696170807, "learning_rate": 3.416666666666667e-07, "loss": 0.1016, "num_tokens": 2801507.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 9386 }, { "clip_ratio/high_max": 0.008928571827709675, "clip_ratio/high_mean": 0.008928571827709675, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.008928571827709675, "completion_length": 28.25, "completions/clipped_ratio": 0.0, "completions/max_length": 29.0, "completions/max_terminated_length": 29.0, "completions/mean_length": 28.25, "completions/mean_terminated_length": 28.25, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 173.83333333333334, "frac_reward_zero_std": 0.0, "grad_norm": 2.185850143432617, "kl": 0.2425401657819748, "learning_rate": 3.4111111111111116e-07, "loss": 0.014, "num_tokens": 2801836.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 9387 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 18.0, "completions/mean_terminated_length": 18.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 173.85185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.08394826203584671, "kl": 0.01452213665470481, "learning_rate": 3.4055555555555557e-07, "loss": 0.0008, "num_tokens": 2802124.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9388 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.25, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.25, "completions/mean_terminated_length": 13.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 173.87037037037038, "frac_reward_zero_std": 1.0, "grad_norm": 0.009809685871005058, "kl": 0.0021444990998134017, "learning_rate": 3.4000000000000003e-07, "loss": 0.0001, "num_tokens": 2802401.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9389 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 173.88888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.003263027872890234, "kl": 0.00019645095017040148, "learning_rate": 3.394444444444445e-07, "loss": 0.0, "num_tokens": 2802657.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 9390 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.5, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 21.5, "completions/mean_terminated_length": 21.5, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 173.90740740740742, "frac_reward_zero_std": 1.0, "grad_norm": 0.05833924189209938, "kl": 0.008069396717473865, "learning_rate": 3.3888888888888895e-07, "loss": 0.0004, "num_tokens": 2802987.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9391 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.0, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 20.0, "completions/mean_terminated_length": 20.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 173.92592592592592, "frac_reward_zero_std": 0.0, "grad_norm": 3.3466057777404785, "kl": 0.2238062173128128, "learning_rate": 3.3833333333333336e-07, "loss": 0.0221, "num_tokens": 2803287.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 9392 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 36.75, "completions/clipped_ratio": 0.0, "completions/max_length": 45.0, "completions/max_terminated_length": 45.0, "completions/mean_length": 36.75, "completions/mean_terminated_length": 36.75, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 173.94444444444446, "frac_reward_zero_std": 1.0, "grad_norm": 0.11042548716068268, "kl": 0.08636715821921825, "learning_rate": 3.3777777777777777e-07, "loss": 0.0045, "num_tokens": 2803662.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9393 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 35.5, "completions/clipped_ratio": 0.0, "completions/max_length": 39.0, "completions/max_terminated_length": 39.0, "completions/mean_length": 35.5, "completions/mean_terminated_length": 35.5, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 173.96296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.08294305950403214, "kl": 0.030380502808839083, "learning_rate": 3.372222222222223e-07, "loss": 0.0015, "num_tokens": 2804028.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9394 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.75, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 10.75, "completions/mean_terminated_length": 10.75, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 173.9814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.08033723384141922, "kl": 0.0045998357236385345, "learning_rate": 3.366666666666667e-07, "loss": 0.0002, "num_tokens": 2804291.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9395 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 174.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.014550693333148956, "kl": 0.008843765826895833, "learning_rate": 3.3611111111111115e-07, "loss": 0.0004, "num_tokens": 2804563.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9396 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.5, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 21.5, "completions/mean_terminated_length": 21.5, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 174.0185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.06354214251041412, "kl": 0.010884525254368782, "learning_rate": 3.3555555555555556e-07, "loss": 0.0005, "num_tokens": 2804893.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9397 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 174.03703703703704, "frac_reward_zero_std": 0.0, "grad_norm": 2.13466739654541, "kl": 0.17194481403566897, "learning_rate": 3.35e-07, "loss": 0.0953, "num_tokens": 2805217.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 9398 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 174.05555555555554, "frac_reward_zero_std": 1.0, "grad_norm": 0.010318756103515625, "kl": 0.0010359346924815327, "learning_rate": 3.344444444444445e-07, "loss": 0.0001, "num_tokens": 2805477.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 9399 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.25, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 13.25, "completions/mean_terminated_length": 13.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 174.07407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.12247388064861298, "kl": 0.055129002779722214, "learning_rate": 3.3388888888888894e-07, "loss": 0.0032, "num_tokens": 2805782.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9400 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 174.09259259259258, "frac_reward_zero_std": 1.0, "grad_norm": 0.015565677545964718, "kl": 0.03207338321954012, "learning_rate": 3.3333333333333335e-07, "loss": 0.0016, "num_tokens": 2806076.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9401 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 12.5, "completions/mean_terminated_length": 12.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 174.11111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.015477887354791164, "kl": 0.0028617798816412687, "learning_rate": 3.327777777777778e-07, "loss": 0.0001, "num_tokens": 2806358.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9402 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 174.12962962962962, "frac_reward_zero_std": 1.0, "grad_norm": 0.12364506721496582, "kl": 0.02071167714893818, "learning_rate": 3.322222222222223e-07, "loss": 0.0011, "num_tokens": 2806636.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9403 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 174.14814814814815, "frac_reward_zero_std": 0.0, "grad_norm": 6.1129045486450195, "kl": 0.023350203060545027, "learning_rate": 3.316666666666667e-07, "loss": 0.2216, "num_tokens": 2806902.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 9404 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 42.0, "completions/clipped_ratio": 0.0, "completions/max_length": 45.0, "completions/max_terminated_length": 45.0, "completions/mean_length": 42.0, "completions/mean_terminated_length": 42.0, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 174.16666666666666, "frac_reward_zero_std": 1.0, "grad_norm": 0.01051697414368391, "kl": 0.07497105002403259, "learning_rate": 3.3111111111111115e-07, "loss": 0.0037, "num_tokens": 2807338.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9405 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.5, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 4.5, "completions/mean_terminated_length": 4.5, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 174.1851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.013277443125844002, "kl": 0.0003133907957817428, "learning_rate": 3.3055555555555556e-07, "loss": 0.0, "num_tokens": 2807552.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 9406 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 174.2037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.044599514454603195, "kl": 0.016339033842086792, "learning_rate": 3.3e-07, "loss": 0.0008, "num_tokens": 2807820.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9407 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.75, "completions/clipped_ratio": 0.0, "completions/max_length": 25.0, "completions/max_terminated_length": 25.0, "completions/mean_length": 20.75, "completions/mean_terminated_length": 20.75, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 174.22222222222223, "frac_reward_zero_std": 0.0, "grad_norm": 3.237394094467163, "kl": 0.49357375502586365, "learning_rate": 3.294444444444445e-07, "loss": 0.0579, "num_tokens": 2808151.0, "reward": 1.75, "reward_std": 1.5, "rewards/reward_combined/mean": 1.75, "rewards/reward_combined/std": 1.5, "step": 9408 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 44.25, "completions/clipped_ratio": 0.0, "completions/max_length": 71.0, "completions/max_terminated_length": 71.0, "completions/mean_length": 44.25, "completions/mean_terminated_length": 44.25, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 174.24074074074073, "frac_reward_zero_std": 0.0, "grad_norm": 1.012171745300293, "kl": 0.116286251693964, "learning_rate": 3.2888888888888894e-07, "loss": -0.0561, "num_tokens": 2808552.0, "reward": 4.75, "reward_std": 3.4034295082092285, "rewards/reward_combined/mean": 4.75, "rewards/reward_combined/std": 3.4034297466278076, "step": 9409 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 35.0, "completions/clipped_ratio": 0.0, "completions/max_length": 36.0, "completions/max_terminated_length": 36.0, "completions/mean_length": 35.0, "completions/mean_terminated_length": 35.0, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 174.25925925925927, "frac_reward_zero_std": 1.0, "grad_norm": 0.017757898196578026, "kl": 0.0698571503162384, "learning_rate": 3.2833333333333335e-07, "loss": 0.0035, "num_tokens": 2808920.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9410 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 174.27777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.0073946500197052956, "kl": 0.0018669366836547852, "learning_rate": 3.277777777777778e-07, "loss": 0.0001, "num_tokens": 2809132.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 9411 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.25, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 15.25, "completions/mean_terminated_length": 15.25, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 174.2962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.16436271369457245, "kl": 0.020612517138943076, "learning_rate": 3.2722222222222227e-07, "loss": 0.0012, "num_tokens": 2809405.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9412 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 174.3148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.010809493251144886, "kl": 0.2521328032016754, "learning_rate": 3.266666666666667e-07, "loss": 0.0126, "num_tokens": 2809703.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9413 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.75, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 16.75, "completions/mean_terminated_length": 16.75, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 174.33333333333334, "frac_reward_zero_std": 1.0, "grad_norm": 0.14694787561893463, "kl": 0.012183214537799358, "learning_rate": 3.2611111111111114e-07, "loss": 0.0006, "num_tokens": 2809998.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9414 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 174.35185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.02072109282016754, "kl": 0.007137813838198781, "learning_rate": 3.2555555555555555e-07, "loss": 0.0004, "num_tokens": 2810286.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9415 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 174.37037037037038, "frac_reward_zero_std": 1.0, "grad_norm": 0.011624278500676155, "kl": 0.008225895464420319, "learning_rate": 3.25e-07, "loss": 0.0004, "num_tokens": 2810522.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 9416 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 86.75, "completions/clipped_ratio": 0.25, "completions/max_length": 256.0, "completions/max_terminated_length": 33.0, "completions/mean_length": 86.75, "completions/mean_terminated_length": 30.33333396911621, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 174.38888888888889, "frac_reward_zero_std": 0.0, "grad_norm": 1.8445641994476318, "kl": 0.11280851066112518, "learning_rate": 3.2444444444444447e-07, "loss": 0.3931, "num_tokens": 2811093.0, "reward": 6.050000190734863, "reward_std": 3.9000003337860107, "rewards/reward_combined/mean": 6.050000190734863, "rewards/reward_combined/std": 3.9000000953674316, "step": 9417 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.0, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 21.0, "completions/mean_terminated_length": 21.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 174.40740740740742, "frac_reward_zero_std": 1.0, "grad_norm": 0.13516117632389069, "kl": 0.041164327412843704, "learning_rate": 3.2388888888888893e-07, "loss": 0.0021, "num_tokens": 2811397.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 9418 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 174.42592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.011499704793095589, "kl": 0.003112420439720154, "learning_rate": 3.2333333333333334e-07, "loss": 0.0002, "num_tokens": 2811605.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 9419 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.0, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 23.0, "completions/mean_terminated_length": 23.0, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 174.44444444444446, "frac_reward_zero_std": 1.0, "grad_norm": 0.03144095093011856, "kl": 0.00645118672400713, "learning_rate": 3.227777777777778e-07, "loss": 0.0003, "num_tokens": 2811925.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9420 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.75, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 4.75, "completions/mean_terminated_length": 4.75, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 174.46296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.006081536877900362, "kl": 0.0005919113755226135, "learning_rate": 3.2222222222222227e-07, "loss": 0.0, "num_tokens": 2812144.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 9421 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 174.4814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.027005519717931747, "kl": 0.4444723427295685, "learning_rate": 3.2166666666666673e-07, "loss": 0.0222, "num_tokens": 2812428.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9422 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 27.0, "completions/clipped_ratio": 0.0, "completions/max_length": 29.0, "completions/max_terminated_length": 29.0, "completions/mean_length": 27.0, "completions/mean_terminated_length": 27.0, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 174.5, "frac_reward_zero_std": 0.0, "grad_norm": 1.9734370708465576, "kl": 0.20336931943893433, "learning_rate": 3.2111111111111114e-07, "loss": -0.051, "num_tokens": 2812752.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 9423 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 29.5, "completions/clipped_ratio": 0.0, "completions/max_length": 36.0, "completions/max_terminated_length": 36.0, "completions/mean_length": 29.5, "completions/mean_terminated_length": 29.5, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 174.5185185185185, "frac_reward_zero_std": 0.0, "grad_norm": 2.918597936630249, "kl": 0.4003644287586212, "learning_rate": 3.2055555555555555e-07, "loss": 0.0203, "num_tokens": 2813122.0, "reward": 7.75, "reward_std": 0.28867512941360474, "rewards/reward_combined/mean": 7.75, "rewards/reward_combined/std": 0.28867512941360474, "step": 9424 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.25, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 18.25, "completions/mean_terminated_length": 18.25, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 174.53703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.03532989323139191, "kl": 0.003480578539893031, "learning_rate": 3.2e-07, "loss": 0.0002, "num_tokens": 2813435.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9425 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.25, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 16.25, "completions/mean_terminated_length": 16.25, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 174.55555555555554, "frac_reward_zero_std": 1.0, "grad_norm": 0.07787521928548813, "kl": 0.2814785838127136, "learning_rate": 3.1944444444444447e-07, "loss": 0.0141, "num_tokens": 2813724.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9426 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 174.57407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.021780483424663544, "kl": 0.013765878044068813, "learning_rate": 3.1888888888888893e-07, "loss": 0.0007, "num_tokens": 2813984.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9427 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 174.59259259259258, "frac_reward_zero_std": 1.0, "grad_norm": 0.03634270280599594, "kl": 0.007324218982830644, "learning_rate": 3.1833333333333334e-07, "loss": 0.0004, "num_tokens": 2814256.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9428 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 174.61111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.011972697451710701, "kl": 0.003158785868436098, "learning_rate": 3.177777777777778e-07, "loss": 0.0001, "num_tokens": 2814524.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9429 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 9.5, "completions/mean_terminated_length": 9.5, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 174.62962962962962, "frac_reward_zero_std": 1.0, "grad_norm": 0.04859980195760727, "kl": 0.020251546055078506, "learning_rate": 3.1722222222222226e-07, "loss": 0.0009, "num_tokens": 2814774.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 9430 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.25, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.25, "completions/mean_terminated_length": 13.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 174.64814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.00941474363207817, "kl": 0.0021417071111500263, "learning_rate": 3.166666666666667e-07, "loss": 0.0001, "num_tokens": 2815051.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9431 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 174.66666666666666, "frac_reward_zero_std": 1.0, "grad_norm": 2.2627593352808617e-05, "kl": 4.567205905914307e-06, "learning_rate": 3.1611111111111113e-07, "loss": 0.0, "num_tokens": 2815271.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 9432 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 28.75, "completions/clipped_ratio": 0.0, "completions/max_length": 31.0, "completions/max_terminated_length": 31.0, "completions/mean_length": 28.75, "completions/mean_terminated_length": 28.75, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 174.6851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.06303645670413971, "kl": 0.1167253851890564, "learning_rate": 3.155555555555556e-07, "loss": 0.0058, "num_tokens": 2815622.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9433 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.75, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.75, "completions/mean_terminated_length": 14.75, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 174.7037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.0708339512348175, "kl": 0.02573824574938044, "learning_rate": 3.15e-07, "loss": 0.0013, "num_tokens": 2815913.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9434 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.25, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 20.25, "completions/mean_terminated_length": 20.25, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 174.72222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.043900568038225174, "kl": 0.011257955804467201, "learning_rate": 3.1444444444444446e-07, "loss": 0.0006, "num_tokens": 2816242.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9435 }, { "clip_ratio/high_max": 0.004629629664123058, "clip_ratio/high_mean": 0.004629629664123058, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004629629664123058, "completion_length": 41.25, "completions/clipped_ratio": 0.0, "completions/max_length": 56.0, "completions/max_terminated_length": 56.0, "completions/mean_length": 41.25, "completions/mean_terminated_length": 41.25, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 174.74074074074073, "frac_reward_zero_std": 0.0, "grad_norm": 1.5966063737869263, "kl": 0.04187890887260437, "learning_rate": 3.138888888888889e-07, "loss": 0.0201, "num_tokens": 2816627.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 9436 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.0, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 19.0, "completions/mean_terminated_length": 19.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 174.75925925925927, "frac_reward_zero_std": 1.0, "grad_norm": 0.07008206844329834, "kl": 0.02513787802308798, "learning_rate": 3.1333333333333333e-07, "loss": 0.0012, "num_tokens": 2816923.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9437 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 174.77777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.014295341446995735, "kl": 0.004360601888038218, "learning_rate": 3.127777777777778e-07, "loss": 0.0002, "num_tokens": 2817225.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9438 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.75, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.75, "completions/mean_terminated_length": 6.75, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 174.7962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.020556095987558365, "kl": 0.0011889984307345003, "learning_rate": 3.1222222222222226e-07, "loss": 0.0001, "num_tokens": 2817460.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 9439 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.25, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 21.25, "completions/mean_terminated_length": 21.25, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 174.8148148148148, "frac_reward_zero_std": 0.0, "grad_norm": 2.2863640785217285, "kl": 0.1603342369198799, "learning_rate": 3.1166666666666666e-07, "loss": 0.0849, "num_tokens": 2817773.0, "reward": 4.375, "reward_std": 4.75, "rewards/reward_combined/mean": 4.375, "rewards/reward_combined/std": 4.75, "step": 9440 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 45.0, "completions/clipped_ratio": 0.0, "completions/max_length": 51.0, "completions/max_terminated_length": 51.0, "completions/mean_length": 45.0, "completions/mean_terminated_length": 45.0, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 174.83333333333334, "frac_reward_zero_std": 1.0, "grad_norm": 0.018885551020503044, "kl": 0.045566242188215256, "learning_rate": 3.111111111111111e-07, "loss": 0.0023, "num_tokens": 2818233.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 9441 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 72.0, "completions/clipped_ratio": 0.0, "completions/max_length": 245.0, "completions/max_terminated_length": 245.0, "completions/mean_length": 72.0, "completions/mean_terminated_length": 72.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 174.85185185185185, "frac_reward_zero_std": 0.0, "grad_norm": 3.4627089500427246, "kl": 0.019485909026116133, "learning_rate": 3.105555555555556e-07, "loss": 0.46, "num_tokens": 2818741.0, "reward": 7.0, "reward_std": 1.0, "rewards/reward_combined/mean": 7.0, "rewards/reward_combined/std": 1.0, "step": 9442 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.5, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 16.5, "completions/mean_terminated_length": 16.5, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 174.87037037037038, "frac_reward_zero_std": 1.0, "grad_norm": 0.030800603330135345, "kl": 0.011152718681842089, "learning_rate": 3.1000000000000005e-07, "loss": 0.0006, "num_tokens": 2819055.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 9443 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 174.88888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.010887367650866508, "kl": 0.0031190700829029083, "learning_rate": 3.0944444444444446e-07, "loss": 0.0002, "num_tokens": 2819315.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 9444 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.5, "completions/clipped_ratio": 0.0, "completions/max_length": 47.0, "completions/max_terminated_length": 47.0, "completions/mean_length": 24.5, "completions/mean_terminated_length": 24.5, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 174.90740740740742, "frac_reward_zero_std": 0.0, "grad_norm": 3.8014562129974365, "kl": 0.14527227729558945, "learning_rate": 3.088888888888889e-07, "loss": 0.2566, "num_tokens": 2819633.0, "reward": 4.5, "reward_std": 4.041451930999756, "rewards/reward_combined/mean": 4.5, "rewards/reward_combined/std": 4.041451930999756, "step": 9445 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 174.92592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.04829849302768707, "kl": 0.011952871456742287, "learning_rate": 3.083333333333334e-07, "loss": 0.0006, "num_tokens": 2819905.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9446 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.25, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 17.25, "completions/mean_terminated_length": 17.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 174.94444444444446, "frac_reward_zero_std": 1.0, "grad_norm": 0.07993665337562561, "kl": 0.18050439655780792, "learning_rate": 3.077777777777778e-07, "loss": 0.009, "num_tokens": 2820214.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9447 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 174.96296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.007017338648438454, "kl": 0.0001660019188420847, "learning_rate": 3.0722222222222225e-07, "loss": 0.0, "num_tokens": 2820470.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 9448 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.75, "completions/clipped_ratio": 0.0, "completions/max_length": 28.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 11.75, "completions/mean_terminated_length": 11.75, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 174.9814814814815, "frac_reward_zero_std": 0.0, "grad_norm": 4.861217498779297, "kl": 0.07954345643520355, "learning_rate": 3.0666666666666666e-07, "loss": 0.2295, "num_tokens": 2820717.0, "reward": 3.125, "reward_std": 1.75, "rewards/reward_combined/mean": 3.125, "rewards/reward_combined/std": 1.75, "step": 9449 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.75, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 19.75, "completions/mean_terminated_length": 19.75, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 175.0, "frac_reward_zero_std": 0.0, "grad_norm": 2.400315523147583, "kl": 0.1498376140370965, "learning_rate": 3.061111111111111e-07, "loss": -0.0032, "num_tokens": 2821048.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 9450 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 175.0185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.007692775223404169, "kl": 0.0015703439712524414, "learning_rate": 3.055555555555556e-07, "loss": 0.0001, "num_tokens": 2821260.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 9451 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 50.5, "completions/clipped_ratio": 0.0, "completions/max_length": 56.0, "completions/max_terminated_length": 56.0, "completions/mean_length": 50.5, "completions/mean_terminated_length": 50.5, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "epoch": 175.03703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.03422713279724121, "kl": 0.012302793329581618, "learning_rate": 3.0500000000000004e-07, "loss": 0.0006, "num_tokens": 2821682.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9452 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.25, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.25, "completions/mean_terminated_length": 14.25, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 175.05555555555554, "frac_reward_zero_std": 1.0, "grad_norm": 0.06620452553033829, "kl": 0.008989153429865837, "learning_rate": 3.0444444444444445e-07, "loss": 0.0004, "num_tokens": 2821955.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9453 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 175.07407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.027034055441617966, "kl": 0.0046475413255393505, "learning_rate": 3.038888888888889e-07, "loss": 0.0002, "num_tokens": 2822255.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9454 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 175.09259259259258, "frac_reward_zero_std": 1.0, "grad_norm": 0.022340742871165276, "kl": 0.0021631367271766067, "learning_rate": 3.033333333333334e-07, "loss": 0.0001, "num_tokens": 2822527.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9455 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.5, "completions/clipped_ratio": 0.0, "completions/max_length": 27.0, "completions/max_terminated_length": 27.0, "completions/mean_length": 20.5, "completions/mean_terminated_length": 20.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 175.11111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.12415963411331177, "kl": 0.10163168981671333, "learning_rate": 3.027777777777778e-07, "loss": 0.0051, "num_tokens": 2822837.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9456 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 175.12962962962962, "frac_reward_zero_std": 1.0, "grad_norm": 0.020784247666597366, "kl": 0.0014080136897973716, "learning_rate": 3.0222222222222225e-07, "loss": 0.0001, "num_tokens": 2823097.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 9457 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.75, "completions/clipped_ratio": 0.0, "completions/max_length": 33.0, "completions/max_terminated_length": 33.0, "completions/mean_length": 25.75, "completions/mean_terminated_length": 25.75, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 175.14814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.7938361763954163, "kl": 0.1027665352448821, "learning_rate": 3.016666666666667e-07, "loss": 0.0059, "num_tokens": 2823428.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9458 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 175.16666666666666, "frac_reward_zero_std": 1.0, "grad_norm": 3.551298505044542e-05, "kl": 4.2244791984558105e-06, "learning_rate": 3.0111111111111117e-07, "loss": 0.0, "num_tokens": 2823648.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 9459 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 5.5, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 5.5, "completions/mean_terminated_length": 5.5, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 175.1851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.06745284050703049, "kl": 0.0031813604291528463, "learning_rate": 3.005555555555556e-07, "loss": 0.0002, "num_tokens": 2823870.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 9460 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 175.2037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.02608591876924038, "kl": 0.444695845246315, "learning_rate": 3.0000000000000004e-07, "loss": 0.0222, "num_tokens": 2824154.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9461 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 175.22222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.16464108228683472, "kl": 0.021829976816661656, "learning_rate": 2.9944444444444445e-07, "loss": 0.0011, "num_tokens": 2824442.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9462 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 33.0, "completions/clipped_ratio": 0.0, "completions/max_length": 37.0, "completions/max_terminated_length": 37.0, "completions/mean_length": 33.0, "completions/mean_terminated_length": 33.0, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 175.24074074074073, "frac_reward_zero_std": 1.0, "grad_norm": 0.052849043160676956, "kl": 0.09749055281281471, "learning_rate": 2.988888888888889e-07, "loss": 0.0048, "num_tokens": 2824842.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9463 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 175.25925925925927, "frac_reward_zero_std": 1.0, "grad_norm": 0.007054791320115328, "kl": 0.00016101002256618813, "learning_rate": 2.9833333333333337e-07, "loss": 0.0, "num_tokens": 2825098.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 9464 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 175.27777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.012479757890105247, "kl": 0.002249030047096312, "learning_rate": 2.977777777777778e-07, "loss": 0.0001, "num_tokens": 2825354.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9465 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 87.0, "completions/clipped_ratio": 0.25, "completions/max_length": 256.0, "completions/max_terminated_length": 36.0, "completions/mean_length": 87.0, "completions/mean_terminated_length": 30.666667938232422, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 175.2962962962963, "frac_reward_zero_std": 0.0, "grad_norm": 1.8664942979812622, "kl": 0.109079260379076, "learning_rate": 2.9722222222222224e-07, "loss": 0.3949, "num_tokens": 2825938.0, "reward": 5.425000190734863, "reward_std": 4.149999618530273, "rewards/reward_combined/mean": 5.425000190734863, "rewards/reward_combined/std": 4.150000095367432, "step": 9466 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 35.0, "completions/clipped_ratio": 0.0, "completions/max_length": 38.0, "completions/max_terminated_length": 38.0, "completions/mean_length": 35.0, "completions/mean_terminated_length": 35.0, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 175.3148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.05562755838036537, "kl": 0.08285035379230976, "learning_rate": 2.966666666666667e-07, "loss": 0.004, "num_tokens": 2826302.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9467 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.5, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 19.5, "completions/mean_terminated_length": 19.5, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 175.33333333333334, "frac_reward_zero_std": 1.0, "grad_norm": 0.06413517147302628, "kl": 0.020808065310120583, "learning_rate": 2.9611111111111116e-07, "loss": 0.0011, "num_tokens": 2826600.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9468 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.5, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 16.5, "completions/mean_terminated_length": 16.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 175.35185185185185, "frac_reward_zero_std": 0.0, "grad_norm": 2.751708507537842, "kl": 0.051589321345090866, "learning_rate": 2.9555555555555557e-07, "loss": 0.1102, "num_tokens": 2826890.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 9469 }, { "clip_ratio/high_max": 0.011904762126505375, "clip_ratio/high_mean": 0.011904762126505375, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.011904762126505375, "completion_length": 20.25, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 20.25, "completions/mean_terminated_length": 20.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 175.37037037037038, "frac_reward_zero_std": 0.0, "grad_norm": 1.9137042760849, "kl": 0.02860763855278492, "learning_rate": 2.9500000000000003e-07, "loss": -0.019, "num_tokens": 2827223.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 9470 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.75, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 19.75, "completions/mean_terminated_length": 19.75, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 175.38888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.036826301366090775, "kl": 0.007457676809281111, "learning_rate": 2.9444444444444444e-07, "loss": 0.0004, "num_tokens": 2827550.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 9471 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 175.40740740740742, "frac_reward_zero_std": 1.0, "grad_norm": 0.03065416030585766, "kl": 0.0017100800178013742, "learning_rate": 2.938888888888889e-07, "loss": 0.0001, "num_tokens": 2827814.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9472 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 18.0, "completions/mean_terminated_length": 18.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 175.42592592592592, "frac_reward_zero_std": 0.0, "grad_norm": 3.734456777572632, "kl": 0.6116060093045235, "learning_rate": 2.9333333333333337e-07, "loss": -0.1822, "num_tokens": 2828134.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 9473 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.75, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 16.75, "completions/mean_terminated_length": 16.75, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 175.44444444444446, "frac_reward_zero_std": 1.0, "grad_norm": 0.021164163947105408, "kl": 0.007730983663350344, "learning_rate": 2.927777777777778e-07, "loss": 0.0004, "num_tokens": 2828429.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9474 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.25, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 12.25, "completions/mean_terminated_length": 12.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 175.46296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.02287379652261734, "kl": 0.002568377647548914, "learning_rate": 2.9222222222222224e-07, "loss": 0.0001, "num_tokens": 2828702.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9475 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.0, "completions/clipped_ratio": 0.0, "completions/max_length": 28.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 22.0, "completions/mean_terminated_length": 22.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 175.4814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.1335807889699936, "kl": 0.23899124562740326, "learning_rate": 2.916666666666667e-07, "loss": 0.011, "num_tokens": 2829014.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9476 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 175.5, "frac_reward_zero_std": 1.0, "grad_norm": 0.02365153096616268, "kl": 0.010400008410215378, "learning_rate": 2.9111111111111116e-07, "loss": 0.0005, "num_tokens": 2829326.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 9477 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 51.0, "completions/clipped_ratio": 0.0, "completions/max_length": 51.0, "completions/max_terminated_length": 51.0, "completions/mean_length": 51.0, "completions/mean_terminated_length": 51.0, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "epoch": 175.5185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.009370974265038967, "kl": 0.03989647328853607, "learning_rate": 2.9055555555555557e-07, "loss": 0.002, "num_tokens": 2829810.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 9478 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.5, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 18.5, "completions/mean_terminated_length": 18.5, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 175.53703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.3734356760978699, "kl": 0.20767981559038162, "learning_rate": 2.9000000000000003e-07, "loss": 0.0106, "num_tokens": 2830104.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9479 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.25, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.25, "completions/mean_terminated_length": 11.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 175.55555555555554, "frac_reward_zero_std": 1.0, "grad_norm": 0.029239851981401443, "kl": 0.0099889962002635, "learning_rate": 2.8944444444444444e-07, "loss": 0.0005, "num_tokens": 2830377.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9480 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 37.75, "completions/clipped_ratio": 0.0, "completions/max_length": 47.0, "completions/max_terminated_length": 47.0, "completions/mean_length": 37.75, "completions/mean_terminated_length": 37.75, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 175.57407407407408, "frac_reward_zero_std": 0.0, "grad_norm": 2.2719228267669678, "kl": 0.10367269068956375, "learning_rate": 2.888888888888889e-07, "loss": 0.0743, "num_tokens": 2830756.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 9481 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.25, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 8.25, "completions/mean_terminated_length": 8.25, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 175.59259259259258, "frac_reward_zero_std": 1.0, "grad_norm": 0.04468347132205963, "kl": 0.004908253904432058, "learning_rate": 2.8833333333333336e-07, "loss": 0.0003, "num_tokens": 2831001.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 9482 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 175.61111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.01904965005815029, "kl": 0.002965211868286133, "learning_rate": 2.8777777777777777e-07, "loss": 0.0001, "num_tokens": 2831285.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9483 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 175.62962962962962, "frac_reward_zero_std": 1.0, "grad_norm": 0.02272944711148739, "kl": 0.013627502135932446, "learning_rate": 2.8722222222222223e-07, "loss": 0.0007, "num_tokens": 2831545.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9484 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 7.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 7.0, "completions/mean_terminated_length": 7.0, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 175.64814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.017253456637263298, "kl": 0.0013396568829193711, "learning_rate": 2.866666666666667e-07, "loss": 0.0001, "num_tokens": 2831781.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 9485 }, { "clip_ratio/high_max": 0.008333333767950535, "clip_ratio/high_mean": 0.008333333767950535, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.008333333767950535, "completion_length": 27.75, "completions/clipped_ratio": 0.0, "completions/max_length": 31.0, "completions/max_terminated_length": 31.0, "completions/mean_length": 27.75, "completions/mean_terminated_length": 27.75, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 175.66666666666666, "frac_reward_zero_std": 0.0, "grad_norm": 2.6379380226135254, "kl": 0.19452311843633652, "learning_rate": 2.8611111111111115e-07, "loss": 0.1156, "num_tokens": 2832108.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 9486 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 175.6851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.005234588403254747, "kl": 0.0031288377940654755, "learning_rate": 2.8555555555555556e-07, "loss": 0.0002, "num_tokens": 2832368.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 9487 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.25, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 8.25, "completions/mean_terminated_length": 8.25, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 175.7037037037037, "frac_reward_zero_std": 0.0, "grad_norm": 8.427495956420898, "kl": 0.15270660258829594, "learning_rate": 2.85e-07, "loss": 0.1916, "num_tokens": 2832601.0, "reward": 3.125, "reward_std": 1.75, "rewards/reward_combined/mean": 3.125, "rewards/reward_combined/std": 1.75, "step": 9488 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.75, "completions/clipped_ratio": 0.0, "completions/max_length": 36.0, "completions/max_terminated_length": 36.0, "completions/mean_length": 32.75, "completions/mean_terminated_length": 32.75, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 175.72222222222223, "frac_reward_zero_std": 0.0, "grad_norm": 1.243149995803833, "kl": 0.11983935534954071, "learning_rate": 2.844444444444445e-07, "loss": -0.0427, "num_tokens": 2832984.0, "reward": 5.125, "reward_std": 3.4731109142303467, "rewards/reward_combined/mean": 5.125, "rewards/reward_combined/std": 3.4731109142303467, "step": 9489 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 175.74074074074073, "frac_reward_zero_std": 1.0, "grad_norm": 0.044410284608602524, "kl": 0.03081304393708706, "learning_rate": 2.8388888888888895e-07, "loss": 0.0016, "num_tokens": 2833276.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9490 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 33.0, "completions/clipped_ratio": 0.0, "completions/max_length": 37.0, "completions/max_terminated_length": 37.0, "completions/mean_length": 33.0, "completions/mean_terminated_length": 33.0, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 175.75925925925927, "frac_reward_zero_std": 0.0, "grad_norm": 2.5411410331726074, "kl": 0.12314309552311897, "learning_rate": 2.8333333333333336e-07, "loss": 0.022, "num_tokens": 2833632.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 9491 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 175.77777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.01862994208931923, "kl": 0.0020479485392570496, "learning_rate": 2.8277777777777776e-07, "loss": 0.0001, "num_tokens": 2833840.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 9492 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.0, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 21.0, "completions/mean_terminated_length": 21.0, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 175.7962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.029125111177563667, "kl": 0.005820946302264929, "learning_rate": 2.822222222222222e-07, "loss": 0.0003, "num_tokens": 2834168.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9493 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.0, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 19.0, "completions/mean_terminated_length": 19.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 175.8148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.33224260807037354, "kl": 0.07981178537011147, "learning_rate": 2.816666666666667e-07, "loss": 0.004, "num_tokens": 2834464.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 9494 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.75, "completions/clipped_ratio": 0.0, "completions/max_length": 28.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 19.75, "completions/mean_terminated_length": 19.75, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 175.83333333333334, "frac_reward_zero_std": 1.0, "grad_norm": 0.09788110852241516, "kl": 0.02812346164137125, "learning_rate": 2.8111111111111115e-07, "loss": 0.0015, "num_tokens": 2834763.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9495 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.75, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 14.75, "completions/mean_terminated_length": 14.75, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 175.85185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.017436876893043518, "kl": 0.001961236004717648, "learning_rate": 2.8055555555555556e-07, "loss": 0.0001, "num_tokens": 2835082.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9496 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 175.87037037037038, "frac_reward_zero_std": 1.0, "grad_norm": 0.13731761276721954, "kl": 0.06020618975162506, "learning_rate": 2.8e-07, "loss": 0.0036, "num_tokens": 2835398.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9497 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.5, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 18.5, "completions/mean_terminated_length": 18.5, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 175.88888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.01297926064580679, "kl": 0.0011165559408254921, "learning_rate": 2.794444444444445e-07, "loss": 0.0001, "num_tokens": 2835712.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9498 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.75, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.75, "completions/mean_terminated_length": 14.75, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 175.90740740740742, "frac_reward_zero_std": 1.0, "grad_norm": 0.015236172825098038, "kl": 0.006405138410627842, "learning_rate": 2.7888888888888894e-07, "loss": 0.0003, "num_tokens": 2836003.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9499 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 175.92592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.014486769214272499, "kl": 0.0004436671733856201, "learning_rate": 2.7833333333333335e-07, "loss": 0.0, "num_tokens": 2836215.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 9500 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.75, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 15.75, "completions/mean_terminated_length": 15.75, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 175.94444444444446, "frac_reward_zero_std": 1.0, "grad_norm": 0.021434741094708443, "kl": 0.22218672186136246, "learning_rate": 2.7777777777777776e-07, "loss": 0.0111, "num_tokens": 2836518.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9501 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.75, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 17.75, "completions/mean_terminated_length": 17.75, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 175.96296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.05643043667078018, "kl": 0.17152293771505356, "learning_rate": 2.772222222222222e-07, "loss": 0.0086, "num_tokens": 2836829.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9502 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 27.0, "completions/max_terminated_length": 27.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 175.9814814814815, "frac_reward_zero_std": 0.0, "grad_norm": 5.003294944763184, "kl": 0.09406627621501684, "learning_rate": 2.766666666666667e-07, "loss": 0.1463, "num_tokens": 2837117.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 9503 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 176.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.011481480672955513, "kl": 0.008253030478954315, "learning_rate": 2.7611111111111114e-07, "loss": 0.0004, "num_tokens": 2837353.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 9504 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.25, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.25, "completions/mean_terminated_length": 13.25, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 176.0185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.063280388712883, "kl": 0.014118686318397522, "learning_rate": 2.7555555555555555e-07, "loss": 0.0007, "num_tokens": 2837622.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9505 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 176.03703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.017891405150294304, "kl": 0.003726729308255017, "learning_rate": 2.75e-07, "loss": 0.0002, "num_tokens": 2837890.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9506 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 30.75, "completions/clipped_ratio": 0.0, "completions/max_length": 35.0, "completions/max_terminated_length": 35.0, "completions/mean_length": 30.75, "completions/mean_terminated_length": 30.75, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 176.05555555555554, "frac_reward_zero_std": 0.0, "grad_norm": 1.177750825881958, "kl": 0.08224209025502205, "learning_rate": 2.744444444444445e-07, "loss": -0.1433, "num_tokens": 2838265.0, "reward": 7.875, "reward_std": 0.25, "rewards/reward_combined/mean": 7.875, "rewards/reward_combined/std": 0.25, "step": 9507 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.75, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 14.75, "completions/mean_terminated_length": 14.75, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 176.07407407407408, "frac_reward_zero_std": 0.0, "grad_norm": 3.041159152984619, "kl": 0.0556532246991992, "learning_rate": 2.7388888888888894e-07, "loss": 0.1132, "num_tokens": 2838548.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 9508 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 176.09259259259258, "frac_reward_zero_std": 1.0, "grad_norm": 0.027019966393709183, "kl": 0.0023477572249248624, "learning_rate": 2.7333333333333335e-07, "loss": 0.0001, "num_tokens": 2838808.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9509 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 176.11111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.01217845268547535, "kl": 0.008059442043304443, "learning_rate": 2.727777777777778e-07, "loss": 0.0004, "num_tokens": 2839044.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 9510 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.5, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 18.5, "completions/mean_terminated_length": 18.5, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 176.12962962962962, "frac_reward_zero_std": 1.0, "grad_norm": 0.017929377034306526, "kl": 0.0014343329821713269, "learning_rate": 2.722222222222222e-07, "loss": 0.0001, "num_tokens": 2839358.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9511 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 176.14814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.029079139232635498, "kl": 0.002687810978386551, "learning_rate": 2.716666666666667e-07, "loss": 0.0001, "num_tokens": 2839658.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9512 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 176.16666666666666, "frac_reward_zero_std": 1.0, "grad_norm": 0.02235308662056923, "kl": 0.013674428686499596, "learning_rate": 2.7111111111111114e-07, "loss": 0.0007, "num_tokens": 2839918.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9513 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.75, "completions/clipped_ratio": 0.0, "completions/max_length": 27.0, "completions/max_terminated_length": 27.0, "completions/mean_length": 23.75, "completions/mean_terminated_length": 23.75, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 176.1851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.16394595801830292, "kl": 0.03327193181030452, "learning_rate": 2.7055555555555555e-07, "loss": 0.0018, "num_tokens": 2840241.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9514 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 7.75, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 7.75, "completions/mean_terminated_length": 7.75, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 176.2037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.19027122855186462, "kl": 0.03277803532546386, "learning_rate": 2.7e-07, "loss": 0.0019, "num_tokens": 2840480.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 9515 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.5, "completions/clipped_ratio": 0.0, "completions/max_length": 38.0, "completions/max_terminated_length": 38.0, "completions/mean_length": 21.5, "completions/mean_terminated_length": 21.5, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 176.22222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.24035559594631195, "kl": 0.15953946858644485, "learning_rate": 2.6944444444444447e-07, "loss": 0.0075, "num_tokens": 2840786.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9516 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.75, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.75, "completions/mean_terminated_length": 14.75, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 176.24074074074073, "frac_reward_zero_std": 1.0, "grad_norm": 0.03725803643465042, "kl": 0.030751315876841545, "learning_rate": 2.6888888888888893e-07, "loss": 0.0016, "num_tokens": 2841077.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9517 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.75, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 19.75, "completions/mean_terminated_length": 19.75, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 176.25925925925927, "frac_reward_zero_std": 0.0, "grad_norm": 2.999216318130493, "kl": 0.0991252064704895, "learning_rate": 2.6833333333333334e-07, "loss": -0.0344, "num_tokens": 2841384.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 9518 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.5, "completions/clipped_ratio": 0.0, "completions/max_length": 28.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 23.5, "completions/mean_terminated_length": 23.5, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 176.27777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.032519545406103134, "kl": 0.012978303246200085, "learning_rate": 2.677777777777778e-07, "loss": 0.0007, "num_tokens": 2841726.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 9519 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 38.0, "completions/clipped_ratio": 0.0, "completions/max_length": 56.0, "completions/max_terminated_length": 56.0, "completions/mean_length": 38.0, "completions/mean_terminated_length": 38.0, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 176.2962962962963, "frac_reward_zero_std": 0.0, "grad_norm": 2.359875440597534, "kl": 0.01916984561830759, "learning_rate": 2.6722222222222226e-07, "loss": 0.1284, "num_tokens": 2842102.0, "reward": 6.75, "reward_std": 1.5, "rewards/reward_combined/mean": 6.75, "rewards/reward_combined/std": 1.5, "step": 9520 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.25, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.25, "completions/mean_terminated_length": 11.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 176.3148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.013968772254884243, "kl": 0.006323275389149785, "learning_rate": 2.666666666666667e-07, "loss": 0.0003, "num_tokens": 2842375.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9521 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.75, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.75, "completions/mean_terminated_length": 14.75, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 176.33333333333334, "frac_reward_zero_std": 1.0, "grad_norm": 0.06466874480247498, "kl": 0.008266122546046972, "learning_rate": 2.6611111111111113e-07, "loss": 0.0004, "num_tokens": 2842666.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9522 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 49.0, "completions/clipped_ratio": 0.0, "completions/max_length": 51.0, "completions/max_terminated_length": 51.0, "completions/mean_length": 49.0, "completions/mean_terminated_length": 49.0, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 176.35185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.012294154614210129, "kl": 0.041777245700359344, "learning_rate": 2.6555555555555554e-07, "loss": 0.0021, "num_tokens": 2843142.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 9523 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 176.37037037037038, "frac_reward_zero_std": 1.0, "grad_norm": 0.03531062975525856, "kl": 0.04022184433415532, "learning_rate": 2.65e-07, "loss": 0.002, "num_tokens": 2843434.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9524 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 176.38888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.0007452598074451089, "kl": 0.0017023789114318788, "learning_rate": 2.6444444444444447e-07, "loss": 0.0001, "num_tokens": 2843714.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9525 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 176.40740740740742, "frac_reward_zero_std": 1.0, "grad_norm": 0.04235989600419998, "kl": 0.01714207883924246, "learning_rate": 2.6388888888888893e-07, "loss": 0.0009, "num_tokens": 2843982.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9526 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 176.42592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.0747215747833252, "kl": 0.0028883665800094604, "learning_rate": 2.6333333333333334e-07, "loss": 0.0001, "num_tokens": 2844194.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 9527 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.0, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 22.0, "completions/mean_terminated_length": 22.0, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 176.44444444444446, "frac_reward_zero_std": 1.0, "grad_norm": 0.2192646712064743, "kl": 0.05278550274670124, "learning_rate": 2.627777777777778e-07, "loss": 0.0027, "num_tokens": 2844526.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9528 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 176.46296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.1797914057970047, "kl": 0.017255255952477455, "learning_rate": 2.6222222222222226e-07, "loss": 0.001, "num_tokens": 2844774.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 9529 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 176.4814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.006195253226906061, "kl": 0.22639156877994537, "learning_rate": 2.616666666666667e-07, "loss": 0.0113, "num_tokens": 2845076.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9530 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 30.5, "completions/clipped_ratio": 0.0, "completions/max_length": 33.0, "completions/max_terminated_length": 33.0, "completions/mean_length": 30.5, "completions/mean_terminated_length": 30.5, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 176.5, "frac_reward_zero_std": 1.0, "grad_norm": 0.1258293092250824, "kl": 0.15034300833940506, "learning_rate": 2.6111111111111113e-07, "loss": 0.0076, "num_tokens": 2845422.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9531 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 176.5185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.00229882774874568, "kl": 0.28220096230506897, "learning_rate": 2.6055555555555554e-07, "loss": 0.0141, "num_tokens": 2845710.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9532 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.75, "completions/clipped_ratio": 0.0, "completions/max_length": 30.0, "completions/max_terminated_length": 30.0, "completions/mean_length": 15.75, "completions/mean_terminated_length": 15.75, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 176.53703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.05328376591205597, "kl": 0.015801661647856236, "learning_rate": 2.6e-07, "loss": 0.0011, "num_tokens": 2846025.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9533 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 18.0, "completions/mean_terminated_length": 18.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 176.55555555555554, "frac_reward_zero_std": 0.0, "grad_norm": 4.512730121612549, "kl": 0.09289288148283958, "learning_rate": 2.5944444444444446e-07, "loss": 0.0271, "num_tokens": 2846317.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 9534 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.75, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 18.75, "completions/mean_terminated_length": 18.75, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 176.57407407407408, "frac_reward_zero_std": 0.0, "grad_norm": 3.830061912536621, "kl": 0.16524893790483475, "learning_rate": 2.588888888888889e-07, "loss": 0.0919, "num_tokens": 2846632.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 9535 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 30.5, "completions/clipped_ratio": 0.0, "completions/max_length": 35.0, "completions/max_terminated_length": 35.0, "completions/mean_length": 30.5, "completions/mean_terminated_length": 30.5, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 176.59259259259258, "frac_reward_zero_std": 0.0, "grad_norm": 4.166101455688477, "kl": 0.11684366315603256, "learning_rate": 2.5833333333333333e-07, "loss": 0.042, "num_tokens": 2846970.0, "reward": 5.25, "reward_std": 5.5, "rewards/reward_combined/mean": 5.25, "rewards/reward_combined/std": 5.5, "step": 9536 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.5, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 21.5, "completions/mean_terminated_length": 21.5, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 176.61111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.05561874061822891, "kl": 0.02098531648516655, "learning_rate": 2.577777777777778e-07, "loss": 0.0011, "num_tokens": 2847304.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9537 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 176.62962962962962, "frac_reward_zero_std": 1.0, "grad_norm": 0.004093245603144169, "kl": 0.00022166966664372012, "learning_rate": 2.5722222222222225e-07, "loss": 0.0, "num_tokens": 2847560.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 9538 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 176.64814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.011031086556613445, "kl": 0.008098291233181953, "learning_rate": 2.566666666666667e-07, "loss": 0.0004, "num_tokens": 2847872.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 9539 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 176.66666666666666, "frac_reward_zero_std": 1.0, "grad_norm": 0.01756414957344532, "kl": 0.4377230107784271, "learning_rate": 2.561111111111111e-07, "loss": 0.0219, "num_tokens": 2848156.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9540 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.0, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 20.0, "completions/mean_terminated_length": 20.0, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 176.6851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.45906925201416016, "kl": 0.04319735802710056, "learning_rate": 2.555555555555556e-07, "loss": 0.0021, "num_tokens": 2848488.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9541 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.75, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 13.75, "completions/mean_terminated_length": 13.75, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 176.7037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.04765874147415161, "kl": 0.006169597152620554, "learning_rate": 2.55e-07, "loss": 0.0003, "num_tokens": 2848763.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9542 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.5, "completions/clipped_ratio": 0.0, "completions/max_length": 45.0, "completions/max_terminated_length": 45.0, "completions/mean_length": 32.5, "completions/mean_terminated_length": 32.5, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 176.72222222222223, "frac_reward_zero_std": 0.0, "grad_norm": 0.6507717370986938, "kl": 0.31971051916480064, "learning_rate": 2.5444444444444446e-07, "loss": -0.0914, "num_tokens": 2849161.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 9543 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.25, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.25, "completions/mean_terminated_length": 13.25, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 176.74074074074073, "frac_reward_zero_std": 0.0, "grad_norm": 4.593164443969727, "kl": 0.08790410996880382, "learning_rate": 2.538888888888889e-07, "loss": 0.0283, "num_tokens": 2849446.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 9544 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 176.75925925925927, "frac_reward_zero_std": 1.0, "grad_norm": 5.7215562264900655e-05, "kl": 5.356967449188232e-06, "learning_rate": 2.533333333333333e-07, "loss": 0.0, "num_tokens": 2849666.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 9545 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.5, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 6.5, "completions/mean_terminated_length": 6.5, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 176.77777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.06070149689912796, "kl": 0.026240098290145397, "learning_rate": 2.527777777777778e-07, "loss": 0.0013, "num_tokens": 2849892.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 9546 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 45.75, "completions/clipped_ratio": 0.0, "completions/max_length": 146.0, "completions/max_terminated_length": 146.0, "completions/mean_length": 45.75, "completions/mean_terminated_length": 45.75, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 176.7962962962963, "frac_reward_zero_std": 0.0, "grad_norm": 2.0206642150878906, "kl": 0.012034661485813558, "learning_rate": 2.5222222222222225e-07, "loss": 0.4106, "num_tokens": 2850287.0, "reward": 7.5, "reward_std": 1.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 1.0, "step": 9547 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.5, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 3.5, "completions/mean_terminated_length": 3.5, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 176.8148148148148, "frac_reward_zero_std": 0.0, "grad_norm": 13.541470527648926, "kl": 0.11083927750587463, "learning_rate": 2.516666666666667e-07, "loss": 0.3111, "num_tokens": 2850497.0, "reward": 2.75, "reward_std": 1.5, "rewards/reward_combined/mean": 2.75, "rewards/reward_combined/std": 1.5, "step": 9548 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 176.83333333333334, "frac_reward_zero_std": 1.0, "grad_norm": 0.004265711642801762, "kl": 0.0032174773514270782, "learning_rate": 2.511111111111111e-07, "loss": 0.0002, "num_tokens": 2850757.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 9549 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 29.5, "completions/clipped_ratio": 0.0, "completions/max_length": 32.0, "completions/max_terminated_length": 32.0, "completions/mean_length": 29.5, "completions/mean_terminated_length": 29.5, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 176.85185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.28446704149246216, "kl": 0.11001503467559814, "learning_rate": 2.505555555555556e-07, "loss": 0.0055, "num_tokens": 2851111.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9550 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 5.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 5.0, "completions/mean_terminated_length": 5.0, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 176.87037037037038, "frac_reward_zero_std": 1.0, "grad_norm": 0.004244046751409769, "kl": 0.0006144344806671143, "learning_rate": 2.5000000000000004e-07, "loss": 0.0, "num_tokens": 2851331.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 9551 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 176.88888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.034620027989149094, "kl": 0.003450308460742235, "learning_rate": 2.494444444444445e-07, "loss": 0.0002, "num_tokens": 2851655.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9552 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 36.5, "completions/clipped_ratio": 0.0, "completions/max_length": 51.0, "completions/max_terminated_length": 51.0, "completions/mean_length": 36.5, "completions/mean_terminated_length": 36.5, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 176.90740740740742, "frac_reward_zero_std": 1.0, "grad_norm": 0.04260234534740448, "kl": 0.02541881985962391, "learning_rate": 2.488888888888889e-07, "loss": 0.0011, "num_tokens": 2852021.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9553 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 176.92592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.005193985532969236, "kl": 0.0002915114164352417, "learning_rate": 2.483333333333333e-07, "loss": 0.0, "num_tokens": 2852233.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 9554 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 176.94444444444446, "frac_reward_zero_std": 1.0, "grad_norm": 0.018965035676956177, "kl": 0.004420254030264914, "learning_rate": 2.477777777777778e-07, "loss": 0.0002, "num_tokens": 2852525.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9555 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.5, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 8.5, "completions/mean_terminated_length": 8.5, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 176.96296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.015229009091854095, "kl": 0.0009080991148948669, "learning_rate": 2.4722222222222224e-07, "loss": 0.0, "num_tokens": 2852779.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 9556 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.5, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 19.5, "completions/mean_terminated_length": 19.5, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 176.9814814814815, "frac_reward_zero_std": 0.0, "grad_norm": 2.2009212970733643, "kl": 0.2597320508211851, "learning_rate": 2.466666666666667e-07, "loss": 0.0252, "num_tokens": 2853077.0, "reward": 5.0, "reward_std": 5.0, "rewards/reward_combined/mean": 5.0, "rewards/reward_combined/std": 5.0, "step": 9557 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.25, "completions/clipped_ratio": 0.0, "completions/max_length": 34.0, "completions/max_terminated_length": 34.0, "completions/mean_length": 32.25, "completions/mean_terminated_length": 32.25, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 177.0, "frac_reward_zero_std": 0.0, "grad_norm": 1.8532077074050903, "kl": 0.19146699458360672, "learning_rate": 2.461111111111111e-07, "loss": 0.0101, "num_tokens": 2853434.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 9558 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 177.0185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.009549936279654503, "kl": 0.0059322454035282135, "learning_rate": 2.455555555555556e-07, "loss": 0.0003, "num_tokens": 2853746.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 9559 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.75, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 9.75, "completions/mean_terminated_length": 9.75, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 177.03703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.07668053358793259, "kl": 0.007610459346324205, "learning_rate": 2.4500000000000004e-07, "loss": 0.0005, "num_tokens": 2853997.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 9560 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 177.05555555555554, "frac_reward_zero_std": 1.0, "grad_norm": 0.013940486125648022, "kl": 0.004645922454074025, "learning_rate": 2.444444444444445e-07, "loss": 0.0002, "num_tokens": 2854285.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9561 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 34.75, "completions/clipped_ratio": 0.0, "completions/max_length": 40.0, "completions/max_terminated_length": 40.0, "completions/mean_length": 34.75, "completions/mean_terminated_length": 34.75, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 177.07407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.05931701138615608, "kl": 0.0688610877841711, "learning_rate": 2.438888888888889e-07, "loss": 0.0034, "num_tokens": 2854652.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9562 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 177.09259259259258, "frac_reward_zero_std": 1.0, "grad_norm": 0.0342312753200531, "kl": 0.0017276882863370702, "learning_rate": 2.433333333333333e-07, "loss": 0.0001, "num_tokens": 2854908.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 9563 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.25, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 15.25, "completions/mean_terminated_length": 15.25, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 177.11111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.034195173531770706, "kl": 0.04051907919347286, "learning_rate": 2.427777777777778e-07, "loss": 0.002, "num_tokens": 2855201.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9564 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.25, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 19.25, "completions/mean_terminated_length": 19.25, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 177.12962962962962, "frac_reward_zero_std": 1.0, "grad_norm": 0.08915529400110245, "kl": 0.030659226700663567, "learning_rate": 2.4222222222222224e-07, "loss": 0.0015, "num_tokens": 2855498.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9565 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 177.14814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.046062640845775604, "kl": 0.00357311824336648, "learning_rate": 2.416666666666667e-07, "loss": 0.0002, "num_tokens": 2855762.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9566 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.5, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 17.5, "completions/mean_terminated_length": 17.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 177.16666666666666, "frac_reward_zero_std": 1.0, "grad_norm": 0.035302046686410904, "kl": 0.17272794991731644, "learning_rate": 2.411111111111111e-07, "loss": 0.0086, "num_tokens": 2856072.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9567 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.25, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.25, "completions/mean_terminated_length": 11.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 177.1851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.04724346473813057, "kl": 0.01238948805257678, "learning_rate": 2.4055555555555557e-07, "loss": 0.0006, "num_tokens": 2856345.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9568 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.25, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 16.25, "completions/mean_terminated_length": 16.25, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 177.2037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.018047362565994263, "kl": 0.0033527729101479053, "learning_rate": 2.4000000000000003e-07, "loss": 0.0002, "num_tokens": 2856650.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9569 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 36.0, "completions/clipped_ratio": 0.0, "completions/max_length": 45.0, "completions/max_terminated_length": 45.0, "completions/mean_length": 36.0, "completions/mean_terminated_length": 36.0, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 177.22222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.1587844043970108, "kl": 0.10417036712169647, "learning_rate": 2.394444444444445e-07, "loss": 0.0048, "num_tokens": 2857062.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9570 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 5.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 5.0, "completions/mean_terminated_length": 5.0, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 177.24074074074073, "frac_reward_zero_std": 1.0, "grad_norm": 0.004416433162987232, "kl": 0.0006586491945199668, "learning_rate": 2.388888888888889e-07, "loss": 0.0, "num_tokens": 2857282.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 9571 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 26.0, "completions/clipped_ratio": 0.0, "completions/max_length": 29.0, "completions/max_terminated_length": 29.0, "completions/mean_length": 26.0, "completions/mean_terminated_length": 26.0, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 177.25925925925927, "frac_reward_zero_std": 1.0, "grad_norm": 0.06745550781488419, "kl": 0.11959455907344818, "learning_rate": 2.3833333333333336e-07, "loss": 0.0061, "num_tokens": 2857602.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9572 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 177.27777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.02579745650291443, "kl": 0.0023519322276115417, "learning_rate": 2.3777777777777777e-07, "loss": 0.0001, "num_tokens": 2857862.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9573 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.0, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 22.0, "completions/mean_terminated_length": 22.0, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 177.2962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.03819455951452255, "kl": 0.008495406713336706, "learning_rate": 2.3722222222222223e-07, "loss": 0.0004, "num_tokens": 2858178.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9574 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 177.3148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 3.683716931845993e-05, "kl": 4.641711711883545e-06, "learning_rate": 2.3666666666666667e-07, "loss": 0.0, "num_tokens": 2858398.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 9575 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 27.0, "completions/clipped_ratio": 0.0, "completions/max_length": 29.0, "completions/max_terminated_length": 29.0, "completions/mean_length": 27.0, "completions/mean_terminated_length": 27.0, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 177.33333333333334, "frac_reward_zero_std": 0.0, "grad_norm": 3.0088720321655273, "kl": 0.17618703097105026, "learning_rate": 2.3611111111111113e-07, "loss": -0.0381, "num_tokens": 2858730.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 9576 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 177.35185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.09424345195293427, "kl": 0.03641865774989128, "learning_rate": 2.3555555555555556e-07, "loss": 0.002, "num_tokens": 2859002.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9577 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 7.25, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 7.25, "completions/mean_terminated_length": 7.25, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 177.37037037037038, "frac_reward_zero_std": 0.0, "grad_norm": 8.042472839355469, "kl": 2.3904250748455524, "learning_rate": 2.3500000000000003e-07, "loss": -0.0003, "num_tokens": 2859259.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 9578 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 177.38888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.09247192740440369, "kl": 0.023068408481776714, "learning_rate": 2.3444444444444446e-07, "loss": 0.0012, "num_tokens": 2859543.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9579 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.25, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 13.25, "completions/mean_terminated_length": 13.25, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 177.40740740740742, "frac_reward_zero_std": 0.0, "grad_norm": 4.696101188659668, "kl": 0.10200847359374166, "learning_rate": 2.3388888888888892e-07, "loss": -0.0763, "num_tokens": 2859812.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 9580 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 177.42592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.010543666779994965, "kl": 0.004823621740797535, "learning_rate": 2.3333333333333336e-07, "loss": 0.0002, "num_tokens": 2860072.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 9581 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.75, "completions/clipped_ratio": 0.0, "completions/max_length": 39.0, "completions/max_terminated_length": 39.0, "completions/mean_length": 21.75, "completions/mean_terminated_length": 21.75, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 177.44444444444446, "frac_reward_zero_std": 0.0, "grad_norm": 4.529871463775635, "kl": 0.4127971976995468, "learning_rate": 2.3277777777777782e-07, "loss": 0.0122, "num_tokens": 2860379.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 9582 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.5, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 18.5, "completions/mean_terminated_length": 18.5, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 177.46296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.02970034070312977, "kl": 0.0061189099214971066, "learning_rate": 2.3222222222222223e-07, "loss": 0.0003, "num_tokens": 2860681.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9583 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 177.4814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.007671392522752285, "kl": 0.001732468605041504, "learning_rate": 2.3166666666666666e-07, "loss": 0.0001, "num_tokens": 2860893.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 9584 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 177.5, "frac_reward_zero_std": 1.0, "grad_norm": 0.0023358711041510105, "kl": 0.2822130173444748, "learning_rate": 2.3111111111111112e-07, "loss": 0.0141, "num_tokens": 2861181.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9585 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 177.5185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.012424352578818798, "kl": 0.002899685874581337, "learning_rate": 2.3055555555555556e-07, "loss": 0.0001, "num_tokens": 2861465.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9586 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.25, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 10.25, "completions/mean_terminated_length": 10.25, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 177.53703703703704, "frac_reward_zero_std": 0.0, "grad_norm": 7.993619441986084, "kl": 0.0362096456810832, "learning_rate": 2.3000000000000002e-07, "loss": -0.1129, "num_tokens": 2861730.0, "reward": 7.625, "reward_std": 0.25, "rewards/reward_combined/mean": 7.625, "rewards/reward_combined/std": 0.25, "step": 9587 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 177.55555555555554, "frac_reward_zero_std": 1.0, "grad_norm": 0.015504240989685059, "kl": 0.0029397570760920644, "learning_rate": 2.2944444444444446e-07, "loss": 0.0001, "num_tokens": 2862000.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9588 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.5, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 18.5, "completions/mean_terminated_length": 18.5, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 177.57407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.017873184755444527, "kl": 0.0015359798562712967, "learning_rate": 2.2888888888888892e-07, "loss": 0.0001, "num_tokens": 2862314.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9589 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 43.0, "completions/clipped_ratio": 0.0, "completions/max_length": 51.0, "completions/max_terminated_length": 51.0, "completions/mean_length": 43.0, "completions/mean_terminated_length": 43.0, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "epoch": 177.59259259259258, "frac_reward_zero_std": 1.0, "grad_norm": 0.03173932060599327, "kl": 0.050825467333197594, "learning_rate": 2.2833333333333335e-07, "loss": 0.0025, "num_tokens": 2862766.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 9590 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.5, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 3.5, "completions/mean_terminated_length": 3.5, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 177.61111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.03914560005068779, "kl": 0.002700008451938629, "learning_rate": 2.2777777777777781e-07, "loss": 0.0001, "num_tokens": 2862976.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 9591 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 177.62962962962962, "frac_reward_zero_std": 0.0, "grad_norm": 10.90402889251709, "kl": 0.16211431100964546, "learning_rate": 2.2722222222222225e-07, "loss": 0.1323, "num_tokens": 2863240.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 9592 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 40.75, "completions/clipped_ratio": 0.0, "completions/max_length": 51.0, "completions/max_terminated_length": 51.0, "completions/mean_length": 40.75, "completions/mean_terminated_length": 40.75, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 177.64814814814815, "frac_reward_zero_std": 0.0, "grad_norm": 2.1321442127227783, "kl": 0.034173902589827776, "learning_rate": 2.266666666666667e-07, "loss": -0.088, "num_tokens": 2863623.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 9593 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.75, "completions/clipped_ratio": 0.0, "completions/max_length": 43.0, "completions/max_terminated_length": 43.0, "completions/mean_length": 25.75, "completions/mean_terminated_length": 25.75, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 177.66666666666666, "frac_reward_zero_std": 0.0, "grad_norm": 2.222186326980591, "kl": 0.09309851983562112, "learning_rate": 2.2611111111111112e-07, "loss": 0.1821, "num_tokens": 2863974.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 9594 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.5, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 17.5, "completions/mean_terminated_length": 17.5, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 177.6851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.05546688288450241, "kl": 0.0050855702720582485, "learning_rate": 2.2555555555555555e-07, "loss": 0.0003, "num_tokens": 2864304.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9595 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 177.7037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.037198543548583984, "kl": 0.0006291121244430542, "learning_rate": 2.2500000000000002e-07, "loss": 0.0, "num_tokens": 2864516.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 9596 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 177.72222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.04062540829181671, "kl": 0.43112926185131073, "learning_rate": 2.2444444444444445e-07, "loss": 0.0216, "num_tokens": 2864800.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9597 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.25, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 19.25, "completions/mean_terminated_length": 19.25, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 177.74074074074073, "frac_reward_zero_std": 1.0, "grad_norm": 0.08045687526464462, "kl": 0.01900443248450756, "learning_rate": 2.238888888888889e-07, "loss": 0.0009, "num_tokens": 2865129.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9598 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.75, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 9.75, "completions/mean_terminated_length": 9.75, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 177.75925925925927, "frac_reward_zero_std": 1.0, "grad_norm": 0.09286394715309143, "kl": 0.025446875020861626, "learning_rate": 2.2333333333333335e-07, "loss": 0.0012, "num_tokens": 2865368.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 9599 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 91.5, "completions/clipped_ratio": 0.25, "completions/max_length": 256.0, "completions/max_terminated_length": 41.0, "completions/mean_length": 91.5, "completions/mean_terminated_length": 36.66666793823242, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 177.77777777777777, "frac_reward_zero_std": 0.0, "grad_norm": 1.7586008310317993, "kl": 0.13926365971565247, "learning_rate": 2.227777777777778e-07, "loss": 0.3882, "num_tokens": 2865986.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 9600 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 177.7962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.005586786195635796, "kl": 0.0018955306150019169, "learning_rate": 2.2222222222222224e-07, "loss": 0.0001, "num_tokens": 2866268.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9601 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.25, "completions/clipped_ratio": 0.0, "completions/max_length": 28.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 25.25, "completions/mean_terminated_length": 25.25, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 177.8148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.11324873566627502, "kl": 0.020495938137173653, "learning_rate": 2.216666666666667e-07, "loss": 0.001, "num_tokens": 2866617.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 9602 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.75, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.75, "completions/mean_terminated_length": 6.75, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 177.83333333333334, "frac_reward_zero_std": 1.0, "grad_norm": 0.02089407667517662, "kl": 0.0010022428468801081, "learning_rate": 2.2111111111111114e-07, "loss": 0.0001, "num_tokens": 2866852.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 9603 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.5, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 18.5, "completions/mean_terminated_length": 18.5, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 177.85185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.13373611867427826, "kl": 0.05236709304153919, "learning_rate": 2.2055555555555555e-07, "loss": 0.0026, "num_tokens": 2867146.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 9604 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 27.0, "completions/clipped_ratio": 0.0, "completions/max_length": 31.0, "completions/max_terminated_length": 31.0, "completions/mean_length": 27.0, "completions/mean_terminated_length": 27.0, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 177.87037037037038, "frac_reward_zero_std": 0.0, "grad_norm": 3.8617281913757324, "kl": 0.06383509561419487, "learning_rate": 2.2e-07, "loss": -0.0443, "num_tokens": 2867482.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 9605 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.25, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 21.25, "completions/mean_terminated_length": 21.25, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 177.88888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.042490266263484955, "kl": 0.008178325020708144, "learning_rate": 2.1944444444444445e-07, "loss": 0.0004, "num_tokens": 2867811.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9606 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 35.25, "completions/clipped_ratio": 0.0, "completions/max_length": 61.0, "completions/max_terminated_length": 61.0, "completions/mean_length": 35.25, "completions/mean_terminated_length": 35.25, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 177.90740740740742, "frac_reward_zero_std": 0.0, "grad_norm": 4.266460418701172, "kl": 0.07170997560024261, "learning_rate": 2.188888888888889e-07, "loss": 0.1855, "num_tokens": 2868188.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 9607 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 25.0, "completions/max_terminated_length": 25.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 177.92592592592592, "frac_reward_zero_std": 0.0, "grad_norm": 6.2611308097839355, "kl": 0.03819169173948467, "learning_rate": 2.1833333333333334e-07, "loss": 0.2007, "num_tokens": 2868498.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 9608 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.75, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 13.75, "completions/mean_terminated_length": 13.75, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 177.94444444444446, "frac_reward_zero_std": 0.0, "grad_norm": 2.5543787479400635, "kl": 0.7559559196233749, "learning_rate": 2.177777777777778e-07, "loss": -0.0007, "num_tokens": 2868785.0, "reward": 5.75, "reward_std": 4.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 4.5, "step": 9609 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 38.5, "completions/clipped_ratio": 0.0, "completions/max_length": 54.0, "completions/max_terminated_length": 54.0, "completions/mean_length": 38.5, "completions/mean_terminated_length": 38.5, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 177.96296296296296, "frac_reward_zero_std": 0.0, "grad_norm": 1.1728615760803223, "kl": 0.14864064380526543, "learning_rate": 2.1722222222222224e-07, "loss": -0.0261, "num_tokens": 2869163.0, "reward": 2.125, "reward_std": 4.2303466796875, "rewards/reward_combined/mean": 2.125, "rewards/reward_combined/std": 4.2303466796875, "step": 9610 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.25, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.25, "completions/mean_terminated_length": 15.25, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 177.9814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.07449386268854141, "kl": 0.23743876814842224, "learning_rate": 2.166666666666667e-07, "loss": 0.0118, "num_tokens": 2869464.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9611 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 178.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.011802761815488338, "kl": 0.00819879025220871, "learning_rate": 2.1611111111111114e-07, "loss": 0.0004, "num_tokens": 2869700.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 9612 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 178.0185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.019172728061676025, "kl": 0.0009688980644568801, "learning_rate": 2.155555555555556e-07, "loss": 0.0, "num_tokens": 2869988.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9613 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 34.5, "completions/clipped_ratio": 0.0, "completions/max_length": 38.0, "completions/max_terminated_length": 38.0, "completions/mean_length": 34.5, "completions/mean_terminated_length": 34.5, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 178.03703703703704, "frac_reward_zero_std": 0.0, "grad_norm": 1.704712152481079, "kl": 0.1337011158466339, "learning_rate": 2.15e-07, "loss": -0.0225, "num_tokens": 2870354.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 9614 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 178.05555555555554, "frac_reward_zero_std": 1.0, "grad_norm": 0.09008818119764328, "kl": 0.016701585613191128, "learning_rate": 2.1444444444444444e-07, "loss": 0.0008, "num_tokens": 2870638.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9615 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.75, "completions/clipped_ratio": 0.0, "completions/max_length": 27.0, "completions/max_terminated_length": 27.0, "completions/mean_length": 25.75, "completions/mean_terminated_length": 25.75, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 178.07407407407408, "frac_reward_zero_std": 0.0, "grad_norm": 3.071286678314209, "kl": 0.06776305008679628, "learning_rate": 2.138888888888889e-07, "loss": 0.0098, "num_tokens": 2870989.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 9616 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 178.09259259259258, "frac_reward_zero_std": 1.0, "grad_norm": 0.0346197672188282, "kl": 0.007836889009922743, "learning_rate": 2.1333333333333334e-07, "loss": 0.0004, "num_tokens": 2871277.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9617 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 49.0, "completions/clipped_ratio": 0.0, "completions/max_length": 51.0, "completions/max_terminated_length": 51.0, "completions/mean_length": 49.0, "completions/mean_terminated_length": 49.0, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 178.11111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.018469106405973434, "kl": 0.04257126338779926, "learning_rate": 2.127777777777778e-07, "loss": 0.0021, "num_tokens": 2871753.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 9618 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 178.12962962962962, "frac_reward_zero_std": 1.0, "grad_norm": 0.010806597769260406, "kl": 0.0031583309173583984, "learning_rate": 2.1222222222222223e-07, "loss": 0.0002, "num_tokens": 2872013.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 9619 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.5, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 20.5, "completions/mean_terminated_length": 20.5, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 178.14814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.07282155007123947, "kl": 0.01557326689362526, "learning_rate": 2.116666666666667e-07, "loss": 0.0008, "num_tokens": 2872339.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9620 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.25, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 15.25, "completions/mean_terminated_length": 15.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 178.16666666666666, "frac_reward_zero_std": 0.0, "grad_norm": 3.1524157524108887, "kl": 0.11601917445659637, "learning_rate": 2.1111111111111113e-07, "loss": 0.0635, "num_tokens": 2872624.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 9621 }, { "clip_ratio/high_max": 0.008474576286971569, "clip_ratio/high_mean": 0.008474576286971569, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.008474576286971569, "completion_length": 33.75, "completions/clipped_ratio": 0.0, "completions/max_length": 41.0, "completions/max_terminated_length": 41.0, "completions/mean_length": 33.75, "completions/mean_terminated_length": 33.75, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 178.1851851851852, "frac_reward_zero_std": 0.0, "grad_norm": 1.5174951553344727, "kl": 0.0991615578532219, "learning_rate": 2.105555555555556e-07, "loss": -0.1703, "num_tokens": 2873011.0, "reward": 7.875, "reward_std": 0.25, "rewards/reward_combined/mean": 7.875, "rewards/reward_combined/std": 0.25, "step": 9622 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.25, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 18.25, "completions/mean_terminated_length": 18.25, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 178.2037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.024900399148464203, "kl": 0.008613965474069118, "learning_rate": 2.1000000000000003e-07, "loss": 0.0004, "num_tokens": 2873312.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9623 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.5, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 17.5, "completions/mean_terminated_length": 17.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 178.22222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.06398960202932358, "kl": 0.13773895800113678, "learning_rate": 2.094444444444445e-07, "loss": 0.0068, "num_tokens": 2873622.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9624 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 178.24074074074073, "frac_reward_zero_std": 1.0, "grad_norm": 0.03263575956225395, "kl": 0.006894601974636316, "learning_rate": 2.088888888888889e-07, "loss": 0.0003, "num_tokens": 2873894.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9625 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 178.25925925925927, "frac_reward_zero_std": 1.0, "grad_norm": 0.009694814682006836, "kl": 0.22662892937660217, "learning_rate": 2.0833333333333333e-07, "loss": 0.0113, "num_tokens": 2874196.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9626 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 178.27777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.018549319356679916, "kl": 0.004681834951043129, "learning_rate": 2.077777777777778e-07, "loss": 0.0002, "num_tokens": 2874500.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9627 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 178.2962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.0006512045511044562, "kl": 0.0001076221524272114, "learning_rate": 2.0722222222222223e-07, "loss": 0.0, "num_tokens": 2874756.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 9628 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.5, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 12.5, "completions/mean_terminated_length": 12.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 178.3148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.05304213613271713, "kl": 0.0028073315043002367, "learning_rate": 2.066666666666667e-07, "loss": 0.0001, "num_tokens": 2875018.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9629 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.75, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 4.75, "completions/mean_terminated_length": 4.75, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 178.33333333333334, "frac_reward_zero_std": 1.0, "grad_norm": 0.006095788441598415, "kl": 0.000561404216568917, "learning_rate": 2.0611111111111113e-07, "loss": 0.0, "num_tokens": 2875237.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 9630 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 34.25, "completions/clipped_ratio": 0.0, "completions/max_length": 42.0, "completions/max_terminated_length": 42.0, "completions/mean_length": 34.25, "completions/mean_terminated_length": 34.25, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 178.35185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.03323167562484741, "kl": 0.021062446758151054, "learning_rate": 2.055555555555556e-07, "loss": 0.0011, "num_tokens": 2875598.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9631 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 10.5, "completions/mean_terminated_length": 10.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 178.37037037037038, "frac_reward_zero_std": 1.0, "grad_norm": 0.07641091197729111, "kl": 0.019184116972610354, "learning_rate": 2.0500000000000002e-07, "loss": 0.001, "num_tokens": 2875860.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 9632 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 178.38888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.08134759962558746, "kl": 0.3485010415315628, "learning_rate": 2.0444444444444448e-07, "loss": 0.0177, "num_tokens": 2876142.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9633 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.25, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 8.25, "completions/mean_terminated_length": 8.25, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 178.40740740740742, "frac_reward_zero_std": 0.0, "grad_norm": 6.696260929107666, "kl": 0.06059562414884567, "learning_rate": 2.0388888888888892e-07, "loss": -0.0434, "num_tokens": 2876375.0, "reward": 3.125, "reward_std": 1.75, "rewards/reward_combined/mean": 3.125, "rewards/reward_combined/std": 1.75, "step": 9634 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 178.42592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.0076212347485125065, "kl": 0.0017709285020828247, "learning_rate": 2.0333333333333333e-07, "loss": 0.0001, "num_tokens": 2876587.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 9635 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.25, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 20.25, "completions/mean_terminated_length": 20.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 178.44444444444446, "frac_reward_zero_std": 1.0, "grad_norm": 0.038550376892089844, "kl": 0.012144434731453657, "learning_rate": 2.027777777777778e-07, "loss": 0.0006, "num_tokens": 2876920.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9636 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 12.5, "completions/mean_terminated_length": 12.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 178.46296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.015876922756433487, "kl": 0.00291742873378098, "learning_rate": 2.0222222222222222e-07, "loss": 0.0001, "num_tokens": 2877202.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9637 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.5, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 19.5, "completions/mean_terminated_length": 19.5, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 178.4814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.0508321113884449, "kl": 0.005364958895370364, "learning_rate": 2.0166666666666669e-07, "loss": 0.0003, "num_tokens": 2877520.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9638 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.75, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 19.75, "completions/mean_terminated_length": 19.75, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 178.5, "frac_reward_zero_std": 1.0, "grad_norm": 0.11847475171089172, "kl": 0.06867517903447151, "learning_rate": 2.0111111111111112e-07, "loss": 0.0035, "num_tokens": 2877827.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9639 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 178.5185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 4.6928489609854296e-05, "kl": 5.029141902923584e-06, "learning_rate": 2.0055555555555558e-07, "loss": 0.0, "num_tokens": 2878047.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 9640 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 178.53703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.007958855479955673, "kl": 0.008187071420252323, "learning_rate": 2.0000000000000002e-07, "loss": 0.0004, "num_tokens": 2878359.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 9641 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 178.55555555555554, "frac_reward_zero_std": 1.0, "grad_norm": 0.026247834786772728, "kl": 0.028446178883314133, "learning_rate": 1.9944444444444448e-07, "loss": 0.0015, "num_tokens": 2878629.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9642 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.75, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.75, "completions/mean_terminated_length": 6.75, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 178.57407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.022009490057826042, "kl": 0.0011712341220118105, "learning_rate": 1.9888888888888891e-07, "loss": 0.0001, "num_tokens": 2878864.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 9643 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 178.59259259259258, "frac_reward_zero_std": 0.0, "grad_norm": 6.941186428070068, "kl": 0.04465697333216667, "learning_rate": 1.9833333333333338e-07, "loss": 0.2468, "num_tokens": 2879140.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 9644 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.25, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 15.25, "completions/mean_terminated_length": 15.25, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 178.61111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.03468911349773407, "kl": 0.04038281738758087, "learning_rate": 1.9777777777777778e-07, "loss": 0.002, "num_tokens": 2879433.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9645 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.25, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 9.25, "completions/mean_terminated_length": 9.25, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 178.62962962962962, "frac_reward_zero_std": 1.0, "grad_norm": 0.03891831636428833, "kl": 0.01298204017803073, "learning_rate": 1.9722222222222222e-07, "loss": 0.0006, "num_tokens": 2879694.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9646 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 178.64814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.0021926097106188536, "kl": 0.2821848690509796, "learning_rate": 1.9666666666666668e-07, "loss": 0.0141, "num_tokens": 2879982.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9647 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.5, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 22.5, "completions/mean_terminated_length": 22.5, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 178.66666666666666, "frac_reward_zero_std": 1.0, "grad_norm": 0.01752030849456787, "kl": 0.00665585370734334, "learning_rate": 1.9611111111111112e-07, "loss": 0.0003, "num_tokens": 2880300.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9648 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.75, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 8.75, "completions/mean_terminated_length": 8.75, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 178.6851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.05659618601202965, "kl": 0.0037059446331113577, "learning_rate": 1.9555555555555558e-07, "loss": 0.0002, "num_tokens": 2880547.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 9649 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 39.0, "completions/clipped_ratio": 0.0, "completions/max_length": 110.0, "completions/max_terminated_length": 110.0, "completions/mean_length": 39.0, "completions/mean_terminated_length": 39.0, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 178.7037037037037, "frac_reward_zero_std": 0.0, "grad_norm": 21.459209442138672, "kl": 2.485884750261903, "learning_rate": 1.95e-07, "loss": 0.5045, "num_tokens": 2880919.0, "reward": 4.925000190734863, "reward_std": 3.797696828842163, "rewards/reward_combined/mean": 4.925000190734863, "rewards/reward_combined/std": 3.797696828842163, "step": 9650 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.5, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 20.5, "completions/mean_terminated_length": 20.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 178.72222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.05387621745467186, "kl": 0.019909250549972057, "learning_rate": 1.9444444444444447e-07, "loss": 0.001, "num_tokens": 2881249.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9651 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 39.5, "completions/clipped_ratio": 0.0, "completions/max_length": 51.0, "completions/max_terminated_length": 51.0, "completions/mean_length": 39.5, "completions/mean_terminated_length": 39.5, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 178.74074074074073, "frac_reward_zero_std": 1.0, "grad_norm": 0.024998042732477188, "kl": 0.025505000725388527, "learning_rate": 1.938888888888889e-07, "loss": 0.0011, "num_tokens": 2881627.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9652 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.5, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 19.5, "completions/mean_terminated_length": 19.5, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 178.75925925925927, "frac_reward_zero_std": 1.0, "grad_norm": 0.036353349685668945, "kl": 0.010193950030952692, "learning_rate": 1.9333333333333337e-07, "loss": 0.0005, "num_tokens": 2881925.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9653 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 26.0, "completions/clipped_ratio": 0.0, "completions/max_length": 29.0, "completions/max_terminated_length": 29.0, "completions/mean_length": 26.0, "completions/mean_terminated_length": 26.0, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 178.77777777777777, "frac_reward_zero_std": 0.0, "grad_norm": 3.44446063041687, "kl": 0.09867623075842857, "learning_rate": 1.927777777777778e-07, "loss": -0.0022, "num_tokens": 2882265.0, "reward": 5.25, "reward_std": 4.5, "rewards/reward_combined/mean": 5.25, "rewards/reward_combined/std": 4.5, "step": 9654 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.25, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 19.25, "completions/mean_terminated_length": 19.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 178.7962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.11967506259679794, "kl": 0.03633320704102516, "learning_rate": 1.9222222222222227e-07, "loss": 0.0018, "num_tokens": 2882562.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 9655 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 178.8148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.2684870660305023, "kl": 0.02284238487482071, "learning_rate": 1.9166666666666668e-07, "loss": 0.0011, "num_tokens": 2882858.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9656 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 178.83333333333334, "frac_reward_zero_std": 1.0, "grad_norm": 0.01717764511704445, "kl": 0.0016982131637632847, "learning_rate": 1.911111111111111e-07, "loss": 0.0001, "num_tokens": 2883180.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9657 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 178.85185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.025882573798298836, "kl": 0.001044534146785736, "learning_rate": 1.9055555555555557e-07, "loss": 0.0001, "num_tokens": 2883392.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 9658 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.5, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 2.5, "completions/mean_terminated_length": 2.5, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 178.87037037037038, "frac_reward_zero_std": 1.0, "grad_norm": 0.03300011157989502, "kl": 0.002771638333797455, "learning_rate": 1.9e-07, "loss": 0.0001, "num_tokens": 2883598.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 9659 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.25, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 12.25, "completions/mean_terminated_length": 12.25, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 178.88888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 1.4717718362808228, "kl": 0.24936415418051183, "learning_rate": 1.8944444444444447e-07, "loss": 0.0176, "num_tokens": 2883871.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9660 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 178.90740740740742, "frac_reward_zero_std": 1.0, "grad_norm": 0.011798275634646416, "kl": 0.008217290043830872, "learning_rate": 1.888888888888889e-07, "loss": 0.0004, "num_tokens": 2884107.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 9661 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.5, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 9.5, "completions/mean_terminated_length": 9.5, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 178.92592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.013892952352762222, "kl": 0.0021364674903452396, "learning_rate": 1.8833333333333337e-07, "loss": 0.0001, "num_tokens": 2884365.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9662 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.0, "completions/clipped_ratio": 0.0, "completions/max_length": 28.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 19.0, "completions/mean_terminated_length": 19.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 178.94444444444446, "frac_reward_zero_std": 0.0, "grad_norm": 4.964146614074707, "kl": 0.19932667911052704, "learning_rate": 1.877777777777778e-07, "loss": 0.1461, "num_tokens": 2884661.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 9663 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 33.75, "completions/clipped_ratio": 0.0, "completions/max_length": 45.0, "completions/max_terminated_length": 45.0, "completions/mean_length": 33.75, "completions/mean_terminated_length": 33.75, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 178.96296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.012828445993363857, "kl": 0.09835219383239746, "learning_rate": 1.8722222222222226e-07, "loss": 0.0045, "num_tokens": 2885064.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9664 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 27.5, "completions/clipped_ratio": 0.0, "completions/max_length": 28.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 27.5, "completions/mean_terminated_length": 27.5, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 178.9814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.054429735988378525, "kl": 0.1431431919336319, "learning_rate": 1.866666666666667e-07, "loss": 0.0071, "num_tokens": 2885398.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9665 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 28.75, "completions/clipped_ratio": 0.0, "completions/max_length": 35.0, "completions/max_terminated_length": 35.0, "completions/mean_length": 28.75, "completions/mean_terminated_length": 28.75, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 179.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.07561731338500977, "kl": 0.10335391014814377, "learning_rate": 1.861111111111111e-07, "loss": 0.0053, "num_tokens": 2885729.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9666 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 179.0185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.5150701403617859, "kl": 0.17045461386442184, "learning_rate": 1.8555555555555557e-07, "loss": 0.0085, "num_tokens": 2885965.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 9667 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 179.03703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.007622746285051107, "kl": 0.008323662914335728, "learning_rate": 1.85e-07, "loss": 0.0004, "num_tokens": 2886277.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 9668 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 179.05555555555554, "frac_reward_zero_std": 1.0, "grad_norm": 0.008822916075587273, "kl": 0.0015167567180469632, "learning_rate": 1.8444444444444446e-07, "loss": 0.0001, "num_tokens": 2886573.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9669 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.25, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 4.25, "completions/mean_terminated_length": 4.25, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 179.07407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.008599059656262398, "kl": 0.00017499923706054688, "learning_rate": 1.838888888888889e-07, "loss": 0.0, "num_tokens": 2886786.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 9670 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.5, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 179.09259259259258, "frac_reward_zero_std": 0.0, "grad_norm": 3.842585325241089, "kl": 0.25686701014637947, "learning_rate": 1.8333333333333336e-07, "loss": 0.0529, "num_tokens": 2887064.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 9671 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 179.11111111111111, "frac_reward_zero_std": 0.0, "grad_norm": 4.391304016113281, "kl": 0.072949742898345, "learning_rate": 1.827777777777778e-07, "loss": 0.0771, "num_tokens": 2887356.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 9672 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.25, "completions/clipped_ratio": 0.0, "completions/max_length": 54.0, "completions/max_terminated_length": 54.0, "completions/mean_length": 23.25, "completions/mean_terminated_length": 23.25, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 179.12962962962962, "frac_reward_zero_std": 0.0, "grad_norm": 4.585651397705078, "kl": 0.05423942720517516, "learning_rate": 1.8222222222222226e-07, "loss": 0.2416, "num_tokens": 2887669.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 9673 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 179.14814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.0009518408915027976, "kl": 0.00015931725283735432, "learning_rate": 1.816666666666667e-07, "loss": 0.0, "num_tokens": 2887925.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 9674 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 179.16666666666666, "frac_reward_zero_std": 1.0, "grad_norm": 0.010355044156312943, "kl": 0.00321294367313385, "learning_rate": 1.8111111111111115e-07, "loss": 0.0002, "num_tokens": 2888185.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 9675 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.5, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 20.5, "completions/mean_terminated_length": 20.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 179.1851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.03378036245703697, "kl": 0.12767730001360178, "learning_rate": 1.8055555555555556e-07, "loss": 0.0064, "num_tokens": 2888515.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9676 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 179.2037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.015520980581641197, "kl": 0.003007391351275146, "learning_rate": 1.8e-07, "loss": 0.0002, "num_tokens": 2888799.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9677 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 45.0, "completions/max_terminated_length": 45.0, "completions/mean_length": 18.0, "completions/mean_terminated_length": 18.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 179.22222222222223, "frac_reward_zero_std": 0.0, "grad_norm": 3.136218309402466, "kl": 0.032722900621593, "learning_rate": 1.7944444444444446e-07, "loss": 0.3317, "num_tokens": 2889095.0, "reward": 5.875, "reward_std": 3.25, "rewards/reward_combined/mean": 5.875, "rewards/reward_combined/std": 3.25, "step": 9678 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 179.24074074074073, "frac_reward_zero_std": 1.0, "grad_norm": 0.026440070942044258, "kl": 0.4447540193796158, "learning_rate": 1.788888888888889e-07, "loss": 0.0222, "num_tokens": 2889379.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9679 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 42.0, "completions/clipped_ratio": 0.0, "completions/max_length": 49.0, "completions/max_terminated_length": 49.0, "completions/mean_length": 42.0, "completions/mean_terminated_length": 42.0, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 179.25925925925927, "frac_reward_zero_std": 1.0, "grad_norm": 0.06754489988088608, "kl": 0.039880117401480675, "learning_rate": 1.7833333333333336e-07, "loss": 0.0018, "num_tokens": 2889767.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9680 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.25, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 8.25, "completions/mean_terminated_length": 8.25, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 179.27777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.07095512747764587, "kl": 0.024895597249269485, "learning_rate": 1.777777777777778e-07, "loss": 0.0013, "num_tokens": 2890012.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 9681 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 35.0, "completions/clipped_ratio": 0.0, "completions/max_length": 45.0, "completions/max_terminated_length": 45.0, "completions/mean_length": 35.0, "completions/mean_terminated_length": 35.0, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 179.2962962962963, "frac_reward_zero_std": 0.0, "grad_norm": 1.3871397972106934, "kl": 0.09962432272732258, "learning_rate": 1.7722222222222225e-07, "loss": -0.1768, "num_tokens": 2890404.0, "reward": 7.875, "reward_std": 0.25, "rewards/reward_combined/mean": 7.875, "rewards/reward_combined/std": 0.25, "step": 9682 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.75, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 12.75, "completions/mean_terminated_length": 12.75, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 179.3148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.07485751062631607, "kl": 0.04649446718394756, "learning_rate": 1.766666666666667e-07, "loss": 0.0025, "num_tokens": 2890679.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9683 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.25, "completions/clipped_ratio": 0.0, "completions/max_length": 31.0, "completions/max_terminated_length": 31.0, "completions/mean_length": 21.25, "completions/mean_terminated_length": 21.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 179.33333333333334, "frac_reward_zero_std": 1.0, "grad_norm": 0.06739923357963562, "kl": 0.07707533240318298, "learning_rate": 1.7611111111111115e-07, "loss": 0.0034, "num_tokens": 2890992.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9684 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.25, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 10.25, "completions/mean_terminated_length": 10.25, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 179.35185185185185, "frac_reward_zero_std": 0.0, "grad_norm": 3.2981579303741455, "kl": 0.13853192329406738, "learning_rate": 1.7555555555555558e-07, "loss": -0.0149, "num_tokens": 2891257.0, "reward": 6.125, "reward_std": 3.75, "rewards/reward_combined/mean": 6.125, "rewards/reward_combined/std": 3.75, "step": 9685 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.75, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.75, "completions/mean_terminated_length": 6.75, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 179.37037037037038, "frac_reward_zero_std": 1.0, "grad_norm": 0.011736897751688957, "kl": 0.0009472547681070864, "learning_rate": 1.7500000000000002e-07, "loss": 0.0, "num_tokens": 2891492.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 9686 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.5, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 8.5, "completions/mean_terminated_length": 8.5, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 179.38888888888889, "frac_reward_zero_std": 0.0, "grad_norm": 5.703742980957031, "kl": 0.0925840213894844, "learning_rate": 1.7444444444444445e-07, "loss": 0.3598, "num_tokens": 2891726.0, "reward": 3.875, "reward_std": 0.25, "rewards/reward_combined/mean": 3.875, "rewards/reward_combined/std": 0.25, "step": 9687 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 7.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 7.0, "completions/mean_terminated_length": 7.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 179.40740740740742, "frac_reward_zero_std": 1.0, "grad_norm": 0.045191485434770584, "kl": 0.0006208717823028564, "learning_rate": 1.738888888888889e-07, "loss": 0.0, "num_tokens": 2891974.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 9688 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.25, "completions/clipped_ratio": 0.0, "completions/max_length": 31.0, "completions/max_terminated_length": 31.0, "completions/mean_length": 25.25, "completions/mean_terminated_length": 25.25, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 179.42592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.8148630261421204, "kl": 0.12562632374465466, "learning_rate": 1.7333333333333335e-07, "loss": 0.0066, "num_tokens": 2892303.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9689 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.25, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 15.25, "completions/mean_terminated_length": 15.25, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 179.44444444444446, "frac_reward_zero_std": 1.0, "grad_norm": 0.034694742411375046, "kl": 0.04023018851876259, "learning_rate": 1.7277777777777779e-07, "loss": 0.002, "num_tokens": 2892596.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9690 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.0, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 17.0, "completions/mean_terminated_length": 17.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 179.46296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.0519324466586113, "kl": 0.01973155466839671, "learning_rate": 1.7222222222222225e-07, "loss": 0.001, "num_tokens": 2892884.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9691 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 27.0, "completions/clipped_ratio": 0.0, "completions/max_length": 29.0, "completions/max_terminated_length": 29.0, "completions/mean_length": 27.0, "completions/mean_terminated_length": 27.0, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 179.4814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.7747874855995178, "kl": 0.2633269131183624, "learning_rate": 1.7166666666666668e-07, "loss": 0.0125, "num_tokens": 2893208.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9692 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.75, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 19.75, "completions/mean_terminated_length": 19.75, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 179.5, "frac_reward_zero_std": 1.0, "grad_norm": 0.02346043288707733, "kl": 0.005369147984310985, "learning_rate": 1.7111111111111114e-07, "loss": 0.0003, "num_tokens": 2893515.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9693 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 179.5185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.006337060127407312, "kl": 0.21787550300359726, "learning_rate": 1.7055555555555558e-07, "loss": 0.0109, "num_tokens": 2893819.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9694 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 179.53703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.0011354684829711914, "kl": 0.0017224890762008727, "learning_rate": 1.7000000000000001e-07, "loss": 0.0001, "num_tokens": 2894099.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9695 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 34.5, "completions/clipped_ratio": 0.0, "completions/max_length": 61.0, "completions/max_terminated_length": 61.0, "completions/mean_length": 34.5, "completions/mean_terminated_length": 34.5, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 179.55555555555554, "frac_reward_zero_std": 0.0, "grad_norm": 2.836568593978882, "kl": 0.07392894476652145, "learning_rate": 1.6944444444444448e-07, "loss": 0.214, "num_tokens": 2894473.0, "reward": 6.5, "reward_std": 2.0, "rewards/reward_combined/mean": 6.5, "rewards/reward_combined/std": 2.0, "step": 9696 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.75, "completions/clipped_ratio": 0.0, "completions/max_length": 27.0, "completions/max_terminated_length": 27.0, "completions/mean_length": 18.75, "completions/mean_terminated_length": 18.75, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 179.57407407407408, "frac_reward_zero_std": 0.0, "grad_norm": 6.4741973876953125, "kl": 0.1526840478181839, "learning_rate": 1.6888888888888888e-07, "loss": 0.1397, "num_tokens": 2894768.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 9697 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.5, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 2.5, "completions/mean_terminated_length": 2.5, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 179.59259259259258, "frac_reward_zero_std": 1.0, "grad_norm": 0.016909390687942505, "kl": 0.001921720802783966, "learning_rate": 1.6833333333333335e-07, "loss": 0.0001, "num_tokens": 2894974.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 9698 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.25, "completions/clipped_ratio": 0.0, "completions/max_length": 25.0, "completions/max_terminated_length": 25.0, "completions/mean_length": 21.25, "completions/mean_terminated_length": 21.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 179.61111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.2791280746459961, "kl": 0.09980425238609314, "learning_rate": 1.6777777777777778e-07, "loss": 0.0055, "num_tokens": 2895279.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 9699 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.75, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 21.75, "completions/mean_terminated_length": 21.75, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 179.62962962962962, "frac_reward_zero_std": 1.0, "grad_norm": 0.03400160372257233, "kl": 0.009720368310809135, "learning_rate": 1.6722222222222224e-07, "loss": 0.0005, "num_tokens": 2895610.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9700 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.008928571827709675, "clip_ratio/low_min": 0.008928571827709675, "clip_ratio/region_mean": 0.008928571827709675, "completion_length": 19.75, "completions/clipped_ratio": 0.0, "completions/max_length": 44.0, "completions/max_terminated_length": 44.0, "completions/mean_length": 19.75, "completions/mean_terminated_length": 19.75, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 179.64814814814815, "frac_reward_zero_std": 0.0, "grad_norm": 6.0707292556762695, "kl": 0.018919361755251884, "learning_rate": 1.6666666666666668e-07, "loss": 0.2814, "num_tokens": 2895901.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 9701 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.75, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 16.75, "completions/mean_terminated_length": 16.75, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 179.66666666666666, "frac_reward_zero_std": 1.0, "grad_norm": 0.2989466190338135, "kl": 0.20262475311756134, "learning_rate": 1.6611111111111114e-07, "loss": 0.0101, "num_tokens": 2896208.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9702 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 5.5, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 5.5, "completions/mean_terminated_length": 5.5, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 179.6851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.06772137433290482, "kl": 0.0031875151325948536, "learning_rate": 1.6555555555555557e-07, "loss": 0.0002, "num_tokens": 2896430.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 9703 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.25, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 18.25, "completions/mean_terminated_length": 18.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 179.7037037037037, "frac_reward_zero_std": 0.0, "grad_norm": 1.8483781814575195, "kl": 0.035027374513447285, "learning_rate": 1.65e-07, "loss": -0.0245, "num_tokens": 2896755.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 9704 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 33.25, "completions/clipped_ratio": 0.0, "completions/max_length": 36.0, "completions/max_terminated_length": 36.0, "completions/mean_length": 33.25, "completions/mean_terminated_length": 33.25, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 179.72222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.03928719088435173, "kl": 0.07140007242560387, "learning_rate": 1.6444444444444447e-07, "loss": 0.0035, "num_tokens": 2897116.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9705 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.25, "completions/clipped_ratio": 0.0, "completions/max_length": 31.0, "completions/max_terminated_length": 31.0, "completions/mean_length": 21.25, "completions/mean_terminated_length": 21.25, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 179.74074074074073, "frac_reward_zero_std": 1.0, "grad_norm": 0.03402471914887428, "kl": 0.00854471605271101, "learning_rate": 1.638888888888889e-07, "loss": 0.0004, "num_tokens": 2897449.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 9706 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 52.0, "completions/clipped_ratio": 0.0, "completions/max_length": 55.0, "completions/max_terminated_length": 55.0, "completions/mean_length": 52.0, "completions/mean_terminated_length": 52.0, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "epoch": 179.75925925925927, "frac_reward_zero_std": 1.0, "grad_norm": 0.37931758165359497, "kl": 0.06828657537698746, "learning_rate": 1.6333333333333334e-07, "loss": 0.0035, "num_tokens": 2897937.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 9707 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 179.77777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.06577008962631226, "kl": 0.005605178885161877, "learning_rate": 1.6277777777777778e-07, "loss": 0.0003, "num_tokens": 2898241.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9708 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.5, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 18.5, "completions/mean_terminated_length": 18.5, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 179.7962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.013019861653447151, "kl": 0.0011187606141902506, "learning_rate": 1.6222222222222224e-07, "loss": 0.0001, "num_tokens": 2898555.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9709 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 179.8148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.016076555475592613, "kl": 0.0018077714485116303, "learning_rate": 1.6166666666666667e-07, "loss": 0.0001, "num_tokens": 2898877.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9710 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 179.83333333333334, "frac_reward_zero_std": 1.0, "grad_norm": 0.11245217174291611, "kl": 0.012357788626104593, "learning_rate": 1.6111111111111113e-07, "loss": 0.0006, "num_tokens": 2899169.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9711 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 38.75, "completions/clipped_ratio": 0.0, "completions/max_length": 51.0, "completions/max_terminated_length": 51.0, "completions/mean_length": 38.75, "completions/mean_terminated_length": 38.75, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 179.85185185185185, "frac_reward_zero_std": 0.0, "grad_norm": 2.1796469688415527, "kl": 0.11051980406045914, "learning_rate": 1.6055555555555557e-07, "loss": -0.0326, "num_tokens": 2899548.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 9712 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 179.87037037037038, "frac_reward_zero_std": 1.0, "grad_norm": 0.007561523467302322, "kl": 0.0017141103744506836, "learning_rate": 1.6e-07, "loss": 0.0001, "num_tokens": 2899760.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 9713 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 179.88888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.11133704334497452, "kl": 0.017476930283010006, "learning_rate": 1.5944444444444446e-07, "loss": 0.0009, "num_tokens": 2900032.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9714 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 179.90740740740742, "frac_reward_zero_std": 1.0, "grad_norm": 7.158605876611546e-05, "kl": 5.81890344619751e-06, "learning_rate": 1.588888888888889e-07, "loss": 0.0, "num_tokens": 2900252.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 9715 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 10.5, "completions/mean_terminated_length": 10.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 179.92592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.03256369009613991, "kl": 0.0024678021436557174, "learning_rate": 1.5833333333333336e-07, "loss": 0.0001, "num_tokens": 2900514.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9716 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 43.25, "completions/clipped_ratio": 0.0, "completions/max_length": 45.0, "completions/max_terminated_length": 45.0, "completions/mean_length": 43.25, "completions/mean_terminated_length": 43.25, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "epoch": 179.94444444444446, "frac_reward_zero_std": 1.0, "grad_norm": 0.0149840097874403, "kl": 0.07162009552121162, "learning_rate": 1.577777777777778e-07, "loss": 0.0036, "num_tokens": 2900955.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9717 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 33.25, "completions/clipped_ratio": 0.0, "completions/max_length": 37.0, "completions/max_terminated_length": 37.0, "completions/mean_length": 33.25, "completions/mean_terminated_length": 33.25, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 179.96296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.17921394109725952, "kl": 0.05332516320049763, "learning_rate": 1.5722222222222223e-07, "loss": 0.0027, "num_tokens": 2901312.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9718 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.75, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 15.75, "completions/mean_terminated_length": 15.75, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 179.9814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 1.5145152807235718, "kl": 0.24231421202421188, "learning_rate": 1.5666666666666667e-07, "loss": 0.0135, "num_tokens": 2901599.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9719 }, { "clip_ratio/high_max": 0.01515151560306549, "clip_ratio/high_mean": 0.01515151560306549, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.01515151560306549, "completion_length": 15.75, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.75, "completions/mean_terminated_length": 15.75, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 180.0, "frac_reward_zero_std": 0.0, "grad_norm": 25.39703369140625, "kl": 4.90724977850914, "learning_rate": 1.5611111111111113e-07, "loss": 0.1919, "num_tokens": 2901886.0, "reward": 7.875, "reward_std": 0.25, "rewards/reward_combined/mean": 7.875, "rewards/reward_combined/std": 0.25, "step": 9720 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.75, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 15.75, "completions/mean_terminated_length": 15.75, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 180.0185185185185, "frac_reward_zero_std": 0.0, "grad_norm": 1.520554542541504, "kl": 0.2424719185801223, "learning_rate": 1.5555555555555556e-07, "loss": 0.0719, "num_tokens": 2902209.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 9721 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.0, "completions/clipped_ratio": 0.0, "completions/max_length": 35.0, "completions/max_terminated_length": 35.0, "completions/mean_length": 23.0, "completions/mean_terminated_length": 23.0, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 180.03703703703704, "frac_reward_zero_std": 0.0, "grad_norm": 1.5434160232543945, "kl": 0.036626370158046484, "learning_rate": 1.5500000000000002e-07, "loss": -0.144, "num_tokens": 2902553.0, "reward": 7.625, "reward_std": 0.25, "rewards/reward_combined/mean": 7.625, "rewards/reward_combined/std": 0.25, "step": 9722 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 17.0, "completions/mean_terminated_length": 17.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 180.05555555555554, "frac_reward_zero_std": 1.0, "grad_norm": 0.03524451330304146, "kl": 0.18008841574192047, "learning_rate": 1.5444444444444446e-07, "loss": 0.009, "num_tokens": 2902861.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9723 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 180.07407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.04417319595813751, "kl": 0.007573556154966354, "learning_rate": 1.538888888888889e-07, "loss": 0.0004, "num_tokens": 2903141.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9724 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 36.25, "completions/clipped_ratio": 0.0, "completions/max_length": 38.0, "completions/max_terminated_length": 38.0, "completions/mean_length": 36.25, "completions/mean_terminated_length": 36.25, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 180.09259259259258, "frac_reward_zero_std": 1.0, "grad_norm": 0.04407687857747078, "kl": 0.06983573734760284, "learning_rate": 1.5333333333333333e-07, "loss": 0.0035, "num_tokens": 2903514.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9725 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 180.11111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.013051359914243221, "kl": 0.002419057476799935, "learning_rate": 1.527777777777778e-07, "loss": 0.0001, "num_tokens": 2903770.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9726 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.5, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 4.5, "completions/mean_terminated_length": 4.5, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 180.12962962962962, "frac_reward_zero_std": 1.0, "grad_norm": 0.040486156940460205, "kl": 0.000784234725870192, "learning_rate": 1.5222222222222223e-07, "loss": 0.0, "num_tokens": 2903984.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 9727 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.5, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 8.5, "completions/mean_terminated_length": 8.5, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 180.14814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.014954215846955776, "kl": 0.0005293756839819252, "learning_rate": 1.516666666666667e-07, "loss": 0.0, "num_tokens": 2904238.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 9728 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.25, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 21.25, "completions/mean_terminated_length": 21.25, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 180.16666666666666, "frac_reward_zero_std": 1.0, "grad_norm": 0.09487248957157135, "kl": 0.027451996691524982, "learning_rate": 1.5111111111111112e-07, "loss": 0.0014, "num_tokens": 2904571.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9729 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.5, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 16.5, "completions/mean_terminated_length": 16.5, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 180.1851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.028240879997611046, "kl": 0.01074343640357256, "learning_rate": 1.5055555555555558e-07, "loss": 0.0005, "num_tokens": 2904885.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 9730 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 9.5, "completions/mean_terminated_length": 9.5, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 180.2037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.08463919162750244, "kl": 0.004856997635215521, "learning_rate": 1.5000000000000002e-07, "loss": 0.0002, "num_tokens": 2905143.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9731 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.75, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 10.75, "completions/mean_terminated_length": 10.75, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 180.22222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.02030097134411335, "kl": 0.010083612985908985, "learning_rate": 1.4944444444444445e-07, "loss": 0.0005, "num_tokens": 2905414.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9732 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.25, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 4.25, "completions/mean_terminated_length": 4.25, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 180.24074074074073, "frac_reward_zero_std": 0.0, "grad_norm": 12.116594314575195, "kl": 0.04503246024250984, "learning_rate": 1.488888888888889e-07, "loss": 0.0618, "num_tokens": 2905651.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 9733 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 180.25925925925927, "frac_reward_zero_std": 1.0, "grad_norm": 0.007540071848779917, "kl": 0.0017801672220230103, "learning_rate": 1.4833333333333335e-07, "loss": 0.0001, "num_tokens": 2905863.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 9734 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 16.5, "completions/mean_terminated_length": 16.5, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 180.27777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.09919171780347824, "kl": 0.27916090190410614, "learning_rate": 1.4777777777777779e-07, "loss": 0.0139, "num_tokens": 2906153.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9735 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 34.5, "completions/clipped_ratio": 0.0, "completions/max_length": 51.0, "completions/max_terminated_length": 51.0, "completions/mean_length": 34.5, "completions/mean_terminated_length": 34.5, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 180.2962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.02901582419872284, "kl": 0.015136186499148607, "learning_rate": 1.4722222222222222e-07, "loss": 0.0006, "num_tokens": 2906511.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9736 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.75, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.75, "completions/mean_terminated_length": 14.75, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 180.3148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.037928808480501175, "kl": 0.030100403353571892, "learning_rate": 1.4666666666666668e-07, "loss": 0.0015, "num_tokens": 2906802.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9737 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.0, "completions/clipped_ratio": 0.0, "completions/max_length": 67.0, "completions/max_terminated_length": 67.0, "completions/mean_length": 22.0, "completions/mean_terminated_length": 22.0, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 180.33333333333334, "frac_reward_zero_std": 0.0, "grad_norm": 4.718386173248291, "kl": 0.011842335341498256, "learning_rate": 1.4611111111111112e-07, "loss": 0.4088, "num_tokens": 2907098.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 9738 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.75, "completions/clipped_ratio": 0.0, "completions/max_length": 28.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 23.75, "completions/mean_terminated_length": 23.75, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 180.35185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.14805571734905243, "kl": 0.04075881280004978, "learning_rate": 1.4555555555555558e-07, "loss": 0.0021, "num_tokens": 2907413.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 9739 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.5, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 6.5, "completions/mean_terminated_length": 6.5, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 180.37037037037038, "frac_reward_zero_std": 0.0, "grad_norm": 8.87792682647705, "kl": 0.19895683228969574, "learning_rate": 1.4500000000000001e-07, "loss": 0.2966, "num_tokens": 2907639.0, "reward": 3.125, "reward_std": 1.75, "rewards/reward_combined/mean": 3.125, "rewards/reward_combined/std": 1.75, "step": 9740 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 180.38888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.010943112894892693, "kl": 0.003345984034240246, "learning_rate": 1.4444444444444445e-07, "loss": 0.0002, "num_tokens": 2907941.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9741 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 180.40740740740742, "frac_reward_zero_std": 1.0, "grad_norm": 0.12807729840278625, "kl": 0.03376125812064856, "learning_rate": 1.4388888888888888e-07, "loss": 0.0018, "num_tokens": 2908239.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9742 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.0, "completions/clipped_ratio": 0.0, "completions/max_length": 35.0, "completions/max_terminated_length": 35.0, "completions/mean_length": 23.0, "completions/mean_terminated_length": 23.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 180.42592592592592, "frac_reward_zero_std": 0.0, "grad_norm": 6.31257963180542, "kl": 0.2153877466917038, "learning_rate": 1.4333333333333335e-07, "loss": -0.0677, "num_tokens": 2908551.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 9743 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 180.44444444444446, "frac_reward_zero_std": 1.0, "grad_norm": 0.02042970061302185, "kl": 0.004390032030642033, "learning_rate": 1.4277777777777778e-07, "loss": 0.0002, "num_tokens": 2908843.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9744 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 29.5, "completions/clipped_ratio": 0.0, "completions/max_length": 33.0, "completions/max_terminated_length": 33.0, "completions/mean_length": 29.5, "completions/mean_terminated_length": 29.5, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 180.46296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.07405776530504227, "kl": 0.1436452865600586, "learning_rate": 1.4222222222222224e-07, "loss": 0.0072, "num_tokens": 2909185.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9745 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 12.5, "completions/mean_terminated_length": 12.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 180.4814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.011015470139682293, "kl": 0.0027338294312357903, "learning_rate": 1.4166666666666668e-07, "loss": 0.0001, "num_tokens": 2909467.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9746 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 59.0, "completions/clipped_ratio": 0.0, "completions/max_length": 74.0, "completions/max_terminated_length": 74.0, "completions/mean_length": 59.0, "completions/mean_terminated_length": 59.0, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 180.5, "frac_reward_zero_std": 0.0, "grad_norm": 1.7518354654312134, "kl": 0.06964358687400818, "learning_rate": 1.411111111111111e-07, "loss": 0.0769, "num_tokens": 2909927.0, "reward": 2.25, "reward_std": 3.5, "rewards/reward_combined/mean": 2.25, "rewards/reward_combined/std": 3.5, "step": 9747 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.75, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 15.75, "completions/mean_terminated_length": 15.75, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 180.5185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.09466277062892914, "kl": 0.0438572823186405, "learning_rate": 1.4055555555555557e-07, "loss": 0.002, "num_tokens": 2910222.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9748 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 180.53703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.020802149549126625, "kl": 0.006268077529966831, "learning_rate": 1.4e-07, "loss": 0.0003, "num_tokens": 2910510.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9749 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 42.25, "completions/clipped_ratio": 0.0, "completions/max_length": 45.0, "completions/max_terminated_length": 45.0, "completions/mean_length": 42.25, "completions/mean_terminated_length": 42.25, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "epoch": 180.55555555555554, "frac_reward_zero_std": 1.0, "grad_norm": 0.3075803518295288, "kl": 0.14596831798553467, "learning_rate": 1.3944444444444447e-07, "loss": 0.007, "num_tokens": 2910947.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9750 }, { "clip_ratio/high_max": 0.013888888992369175, "clip_ratio/high_mean": 0.013888888992369175, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.013888888992369175, "completion_length": 21.25, "completions/clipped_ratio": 0.0, "completions/max_length": 31.0, "completions/max_terminated_length": 31.0, "completions/mean_length": 21.25, "completions/mean_terminated_length": 21.25, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 180.57407407407408, "frac_reward_zero_std": 0.0, "grad_norm": 4.4457173347473145, "kl": 0.0789904254488647, "learning_rate": 1.3888888888888888e-07, "loss": 0.1399, "num_tokens": 2911280.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 9751 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 180.59259259259258, "frac_reward_zero_std": 1.0, "grad_norm": 0.014267906546592712, "kl": 0.0011972524225711823, "learning_rate": 1.3833333333333334e-07, "loss": 0.0001, "num_tokens": 2911524.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 9752 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 180.61111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.025781402364373207, "kl": 0.44462308287620544, "learning_rate": 1.3777777777777778e-07, "loss": 0.0222, "num_tokens": 2911808.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9753 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 180.62962962962962, "frac_reward_zero_std": 1.0, "grad_norm": 0.022632833570241928, "kl": 0.0022710522171109915, "learning_rate": 1.3722222222222224e-07, "loss": 0.0001, "num_tokens": 2912080.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9754 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 180.64814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.008406490087509155, "kl": 0.23921876400709152, "learning_rate": 1.3666666666666667e-07, "loss": 0.0119, "num_tokens": 2912380.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9755 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.5, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 4.5, "completions/mean_terminated_length": 4.5, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 180.66666666666666, "frac_reward_zero_std": 1.0, "grad_norm": 0.0166275966912508, "kl": 0.0005794644239358604, "learning_rate": 1.361111111111111e-07, "loss": 0.0, "num_tokens": 2912598.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 9756 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.75, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 20.75, "completions/mean_terminated_length": 20.75, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 180.6851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.20459598302841187, "kl": 0.044304756447672844, "learning_rate": 1.3555555555555557e-07, "loss": 0.0024, "num_tokens": 2912933.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9757 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.0, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 22.0, "completions/mean_terminated_length": 22.0, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 180.7037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.031117068603634834, "kl": 0.004661833634600043, "learning_rate": 1.35e-07, "loss": 0.0002, "num_tokens": 2913265.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9758 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 27.5, "completions/clipped_ratio": 0.0, "completions/max_length": 28.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 27.5, "completions/mean_terminated_length": 27.5, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 180.72222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.07091022282838821, "kl": 0.11830427870154381, "learning_rate": 1.3444444444444447e-07, "loss": 0.006, "num_tokens": 2913591.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9759 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 180.74074074074073, "frac_reward_zero_std": 1.0, "grad_norm": 0.047296952456235886, "kl": 0.0127630066126585, "learning_rate": 1.338888888888889e-07, "loss": 0.0006, "num_tokens": 2913869.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9760 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.25, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 17.25, "completions/mean_terminated_length": 17.25, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 180.75925925925927, "frac_reward_zero_std": 1.0, "grad_norm": 0.02569713070988655, "kl": 0.0036413968191482127, "learning_rate": 1.3333333333333336e-07, "loss": 0.0002, "num_tokens": 2914178.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9761 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.5, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 18.5, "completions/mean_terminated_length": 18.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 180.77777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.0412740483880043, "kl": 0.017283685505390167, "learning_rate": 1.3277777777777777e-07, "loss": 0.0009, "num_tokens": 2914472.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9762 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 180.7962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.0009824562584981322, "kl": 0.0001660019188420847, "learning_rate": 1.3222222222222223e-07, "loss": 0.0, "num_tokens": 2914728.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 9763 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 46.25, "completions/clipped_ratio": 0.0, "completions/max_length": 51.0, "completions/max_terminated_length": 51.0, "completions/mean_length": 46.25, "completions/mean_terminated_length": 46.25, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 180.8148148148148, "frac_reward_zero_std": 0.0, "grad_norm": 1.282123327255249, "kl": 0.07836546376347542, "learning_rate": 1.3166666666666667e-07, "loss": 0.0092, "num_tokens": 2915193.0, "reward": 2.375, "reward_std": 1.25, "rewards/reward_combined/mean": 2.375, "rewards/reward_combined/std": 1.25, "step": 9764 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.5, "completions/clipped_ratio": 0.0, "completions/max_length": 30.0, "completions/max_terminated_length": 30.0, "completions/mean_length": 22.5, "completions/mean_terminated_length": 22.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 180.83333333333334, "frac_reward_zero_std": 1.0, "grad_norm": 2.8359663486480713, "kl": 0.46052441745996475, "learning_rate": 1.3111111111111113e-07, "loss": 0.0271, "num_tokens": 2915511.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9765 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 35.5, "completions/clipped_ratio": 0.0, "completions/max_length": 56.0, "completions/max_terminated_length": 56.0, "completions/mean_length": 35.5, "completions/mean_terminated_length": 35.5, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 180.85185185185185, "frac_reward_zero_std": 0.0, "grad_norm": 2.2913880348205566, "kl": 0.09835116192698479, "learning_rate": 1.3055555555555556e-07, "loss": -0.1147, "num_tokens": 2915889.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 9766 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 180.87037037037038, "frac_reward_zero_std": 1.0, "grad_norm": 0.035389967262744904, "kl": 0.01919572986662388, "learning_rate": 1.3e-07, "loss": 0.001, "num_tokens": 2916157.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9767 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 180.88888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.020163260400295258, "kl": 0.0034240782260894775, "learning_rate": 1.2944444444444446e-07, "loss": 0.0002, "num_tokens": 2916417.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 9768 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 180.90740740740742, "frac_reward_zero_std": 1.0, "grad_norm": 4.090687434654683e-05, "kl": 5.27501106262207e-06, "learning_rate": 1.288888888888889e-07, "loss": 0.0, "num_tokens": 2916637.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 9769 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.0, "completions/clipped_ratio": 0.0, "completions/max_length": 30.0, "completions/max_terminated_length": 30.0, "completions/mean_length": 25.0, "completions/mean_terminated_length": 25.0, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 180.92592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.09768522530794144, "kl": 0.01795843499712646, "learning_rate": 1.2833333333333336e-07, "loss": 0.001, "num_tokens": 2916965.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9770 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.5, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 3.5, "completions/mean_terminated_length": 3.5, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 180.94444444444446, "frac_reward_zero_std": 1.0, "grad_norm": 0.03915262222290039, "kl": 0.003767728805541992, "learning_rate": 1.277777777777778e-07, "loss": 0.0002, "num_tokens": 2917175.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 9771 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 180.96296296296296, "frac_reward_zero_std": 0.0, "grad_norm": 7.61048698425293, "kl": 0.033652343903668225, "learning_rate": 1.2722222222222223e-07, "loss": 0.0718, "num_tokens": 2917459.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 9772 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 180.9814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.04390448331832886, "kl": 0.0021545656491070986, "learning_rate": 1.2666666666666666e-07, "loss": 0.0001, "num_tokens": 2917723.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9773 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.25, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 9.25, "completions/mean_terminated_length": 9.25, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 181.0, "frac_reward_zero_std": 0.0, "grad_norm": 1.2598240375518799, "kl": 0.8365960847586393, "learning_rate": 1.2611111111111112e-07, "loss": 0.0691, "num_tokens": 2917984.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 9774 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 7.75, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 7.75, "completions/mean_terminated_length": 7.75, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 181.0185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.030558476224541664, "kl": 0.0017347319517284632, "learning_rate": 1.2555555555555556e-07, "loss": 0.0001, "num_tokens": 2918227.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 9775 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.75, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 17.75, "completions/mean_terminated_length": 17.75, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 181.03703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.021553361788392067, "kl": 0.004857918247580528, "learning_rate": 1.2500000000000002e-07, "loss": 0.0002, "num_tokens": 2918526.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9776 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.5, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 4.5, "completions/mean_terminated_length": 4.5, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 181.05555555555554, "frac_reward_zero_std": 0.0, "grad_norm": 1.923479437828064, "kl": 1.3132778853178024, "learning_rate": 1.2444444444444446e-07, "loss": 0.1717, "num_tokens": 2918764.0, "reward": 2.5, "reward_std": 3.0, "rewards/reward_combined/mean": 2.5, "rewards/reward_combined/std": 3.0, "step": 9777 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 181.07407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.03813613951206207, "kl": 0.004398650955408812, "learning_rate": 1.238888888888889e-07, "loss": 0.0002, "num_tokens": 2919050.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9778 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.5, "completions/clipped_ratio": 0.0, "completions/max_length": 34.0, "completions/max_terminated_length": 34.0, "completions/mean_length": 23.5, "completions/mean_terminated_length": 23.5, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 181.09259259259258, "frac_reward_zero_std": 0.0, "grad_norm": 4.285491466522217, "kl": 0.398979589343071, "learning_rate": 1.2333333333333335e-07, "loss": -0.077, "num_tokens": 2919364.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 9779 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.25, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 20.25, "completions/mean_terminated_length": 20.25, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 181.11111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.2401055544614792, "kl": 0.07670835964381695, "learning_rate": 1.227777777777778e-07, "loss": 0.004, "num_tokens": 2919665.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 9780 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 29.5, "completions/clipped_ratio": 0.0, "completions/max_length": 41.0, "completions/max_terminated_length": 41.0, "completions/mean_length": 29.5, "completions/mean_terminated_length": 29.5, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 181.12962962962962, "frac_reward_zero_std": 1.0, "grad_norm": 0.15781404078006744, "kl": 0.03063128236681223, "learning_rate": 1.2222222222222225e-07, "loss": 0.0017, "num_tokens": 2920007.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9781 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 181.14814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.03344513103365898, "kl": 0.0028673249762505293, "learning_rate": 1.2166666666666666e-07, "loss": 0.0001, "num_tokens": 2920267.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9782 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 181.16666666666666, "frac_reward_zero_std": 0.0, "grad_norm": 6.114517688751221, "kl": 0.05226296279579401, "learning_rate": 1.2111111111111112e-07, "loss": 0.2277, "num_tokens": 2920551.0, "reward": 6.375, "reward_std": 2.25, "rewards/reward_combined/mean": 6.375, "rewards/reward_combined/std": 2.25, "step": 9783 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 181.1851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.013344379141926765, "kl": 0.0003841668367385864, "learning_rate": 1.2055555555555555e-07, "loss": 0.0, "num_tokens": 2920763.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 9784 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 181.2037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.03445272147655487, "kl": 0.004858701955527067, "learning_rate": 1.2000000000000002e-07, "loss": 0.0002, "num_tokens": 2921027.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9785 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.75, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 16.75, "completions/mean_terminated_length": 16.75, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 181.22222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.05025097727775574, "kl": 0.15927499532699585, "learning_rate": 1.1944444444444445e-07, "loss": 0.008, "num_tokens": 2921334.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9786 }, { "clip_ratio/high_max": 0.0117647061124444, "clip_ratio/high_mean": 0.0117647061124444, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0117647061124444, "completion_length": 53.75, "completions/clipped_ratio": 0.0, "completions/max_length": 99.0, "completions/max_terminated_length": 99.0, "completions/mean_length": 53.75, "completions/mean_terminated_length": 53.75, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 181.24074074074073, "frac_reward_zero_std": 0.0, "grad_norm": 2.0574417114257812, "kl": 0.18575207889080048, "learning_rate": 1.1888888888888889e-07, "loss": 0.2606, "num_tokens": 2921801.0, "reward": 5.625, "reward_std": 4.422951698303223, "rewards/reward_combined/mean": 5.625, "rewards/reward_combined/std": 4.422951698303223, "step": 9787 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 181.25925925925927, "frac_reward_zero_std": 1.0, "grad_norm": 4.8623922339174896e-05, "kl": 4.954636096954346e-06, "learning_rate": 1.1833333333333333e-07, "loss": 0.0, "num_tokens": 2922021.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 9788 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 181.27777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.008636854588985443, "kl": 0.2390918955206871, "learning_rate": 1.1777777777777778e-07, "loss": 0.012, "num_tokens": 2922321.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9789 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.5, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 6.5, "completions/mean_terminated_length": 6.5, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 181.2962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.03561604395508766, "kl": 0.02391472738236189, "learning_rate": 1.1722222222222223e-07, "loss": 0.0012, "num_tokens": 2922547.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 9790 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.75, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.75, "completions/mean_terminated_length": 13.75, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 181.3148148148148, "frac_reward_zero_std": 0.0, "grad_norm": 3.5518133640289307, "kl": 0.07952412962913513, "learning_rate": 1.1666666666666668e-07, "loss": -0.0155, "num_tokens": 2922826.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 9791 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.5, "completions/clipped_ratio": 0.0, "completions/max_length": 29.0, "completions/max_terminated_length": 29.0, "completions/mean_length": 24.5, "completions/mean_terminated_length": 24.5, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 181.33333333333334, "frac_reward_zero_std": 0.0, "grad_norm": 2.4866299629211426, "kl": 0.2833915762603283, "learning_rate": 1.1611111111111111e-07, "loss": 0.0162, "num_tokens": 2923140.0, "reward": 5.75, "reward_std": 2.598076105117798, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 2.598076105117798, "step": 9792 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 181.35185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.00483608478680253, "kl": 0.00014034211926627904, "learning_rate": 1.1555555555555556e-07, "loss": 0.0, "num_tokens": 2923396.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 9793 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 181.37037037037038, "frac_reward_zero_std": 1.0, "grad_norm": 0.06280596554279327, "kl": 0.010638289342750795, "learning_rate": 1.1500000000000001e-07, "loss": 0.0008, "num_tokens": 2923652.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 9794 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 16.5, "completions/mean_terminated_length": 16.5, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 181.38888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.2461770623922348, "kl": 0.040850822580978274, "learning_rate": 1.1444444444444446e-07, "loss": 0.0022, "num_tokens": 2923958.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9795 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.25, "completions/clipped_ratio": 0.0, "completions/max_length": 30.0, "completions/max_terminated_length": 30.0, "completions/mean_length": 25.25, "completions/mean_terminated_length": 25.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 181.40740740740742, "frac_reward_zero_std": 1.0, "grad_norm": 0.2848750352859497, "kl": 0.07310908287763596, "learning_rate": 1.1388888888888891e-07, "loss": 0.0033, "num_tokens": 2924307.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 9796 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.25, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 16.25, "completions/mean_terminated_length": 16.25, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 181.42592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.04726504534482956, "kl": 0.005148933967575431, "learning_rate": 1.1333333333333336e-07, "loss": 0.0003, "num_tokens": 2924632.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9797 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 39.0, "completions/clipped_ratio": 0.0, "completions/max_length": 45.0, "completions/max_terminated_length": 45.0, "completions/mean_length": 39.0, "completions/mean_terminated_length": 39.0, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 181.44444444444446, "frac_reward_zero_std": 1.0, "grad_norm": 0.014271682128310204, "kl": 0.08506487309932709, "learning_rate": 1.1277777777777778e-07, "loss": 0.004, "num_tokens": 2925056.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9798 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.25, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.25, "completions/mean_terminated_length": 14.25, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 181.46296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.03591519221663475, "kl": 0.017723617143929005, "learning_rate": 1.1222222222222223e-07, "loss": 0.0009, "num_tokens": 2925345.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9799 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 181.4814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.020663727074861526, "kl": 0.006381620187312365, "learning_rate": 1.1166666666666667e-07, "loss": 0.0003, "num_tokens": 2925633.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9800 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.5, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 21.5, "completions/mean_terminated_length": 21.5, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 181.5, "frac_reward_zero_std": 1.0, "grad_norm": 0.028828898444771767, "kl": 0.011078543029725552, "learning_rate": 1.1111111111111112e-07, "loss": 0.0006, "num_tokens": 2925971.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9801 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.75, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 12.75, "completions/mean_terminated_length": 12.75, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 181.5185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.0949200838804245, "kl": 0.05698666628450155, "learning_rate": 1.1055555555555557e-07, "loss": 0.0028, "num_tokens": 2926246.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9802 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 181.53703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.008343631401658058, "kl": 0.008026790805161, "learning_rate": 1.1e-07, "loss": 0.0004, "num_tokens": 2926558.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 9803 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.25, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 17.25, "completions/mean_terminated_length": 17.25, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 181.55555555555554, "frac_reward_zero_std": 1.0, "grad_norm": 0.03842431679368019, "kl": 0.0037712459452450275, "learning_rate": 1.0944444444444445e-07, "loss": 0.0002, "num_tokens": 2926867.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9804 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 53.75, "completions/clipped_ratio": 0.0, "completions/max_length": 75.0, "completions/max_terminated_length": 75.0, "completions/mean_length": 53.75, "completions/mean_terminated_length": 53.75, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 181.57407407407408, "frac_reward_zero_std": 0.0, "grad_norm": 1.7589374780654907, "kl": 0.05093527212738991, "learning_rate": 1.088888888888889e-07, "loss": 0.0964, "num_tokens": 2927362.0, "reward": 1.75, "reward_std": 1.443375587463379, "rewards/reward_combined/mean": 1.75, "rewards/reward_combined/std": 1.4433757066726685, "step": 9805 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 181.59259259259258, "frac_reward_zero_std": 1.0, "grad_norm": 0.027195487171411514, "kl": 0.4445320963859558, "learning_rate": 1.0833333333333335e-07, "loss": 0.0222, "num_tokens": 2927646.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9806 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.5, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 2.5, "completions/mean_terminated_length": 2.5, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 181.61111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.030090827494859695, "kl": 0.0032484307885169983, "learning_rate": 1.077777777777778e-07, "loss": 0.0002, "num_tokens": 2927852.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 9807 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 181.62962962962962, "frac_reward_zero_std": 1.0, "grad_norm": 0.04916427284479141, "kl": 0.011466407217085361, "learning_rate": 1.0722222222222222e-07, "loss": 0.0006, "num_tokens": 2928132.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9808 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.5, "completions/clipped_ratio": 0.0, "completions/max_length": 28.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 22.5, "completions/mean_terminated_length": 22.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 181.64814814814815, "frac_reward_zero_std": 0.0, "grad_norm": 2.59682297706604, "kl": 0.19521822035312653, "learning_rate": 1.0666666666666667e-07, "loss": 0.0239, "num_tokens": 2928450.0, "reward": 5.5, "reward_std": 5.0, "rewards/reward_combined/mean": 5.5, "rewards/reward_combined/std": 5.0, "step": 9809 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 33.75, "completions/clipped_ratio": 0.0, "completions/max_length": 41.0, "completions/max_terminated_length": 41.0, "completions/mean_length": 33.75, "completions/mean_terminated_length": 33.75, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 181.66666666666666, "frac_reward_zero_std": 0.0, "grad_norm": 2.6702651977539062, "kl": 0.13590765744447708, "learning_rate": 1.0611111111111112e-07, "loss": 0.1586, "num_tokens": 2928821.0, "reward": 5.375, "reward_std": 2.462214469909668, "rewards/reward_combined/mean": 5.375, "rewards/reward_combined/std": 2.462214469909668, "step": 9810 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.25, "completions/clipped_ratio": 0.0, "completions/max_length": 27.0, "completions/max_terminated_length": 27.0, "completions/mean_length": 23.25, "completions/mean_terminated_length": 23.25, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 181.6851851851852, "frac_reward_zero_std": 0.0, "grad_norm": 1.496537208557129, "kl": 0.26060161273926497, "learning_rate": 1.0555555555555557e-07, "loss": 0.0651, "num_tokens": 2929142.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 9811 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.5, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.5, "completions/mean_terminated_length": 6.5, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 181.7037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.024220576509833336, "kl": 0.0011109120678156614, "learning_rate": 1.0500000000000001e-07, "loss": 0.0001, "num_tokens": 2929376.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 9812 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 181.72222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.014795628376305103, "kl": 0.031088726595044136, "learning_rate": 1.0444444444444445e-07, "loss": 0.0016, "num_tokens": 2929670.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9813 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 181.74074074074073, "frac_reward_zero_std": 1.0, "grad_norm": 0.0076874918304383755, "kl": 0.0016897916793823242, "learning_rate": 1.038888888888889e-07, "loss": 0.0001, "num_tokens": 2929882.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 9814 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.75, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 13.75, "completions/mean_terminated_length": 13.75, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 181.75925925925927, "frac_reward_zero_std": 1.0, "grad_norm": 0.04391772300004959, "kl": 0.002835989696905017, "learning_rate": 1.0333333333333335e-07, "loss": 0.0001, "num_tokens": 2930149.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9815 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.25, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 18.25, "completions/mean_terminated_length": 18.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 181.77777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.03267636522650719, "kl": 0.01466370327398181, "learning_rate": 1.027777777777778e-07, "loss": 0.0007, "num_tokens": 2930442.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9816 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 43.25, "completions/clipped_ratio": 0.0, "completions/max_length": 56.0, "completions/max_terminated_length": 56.0, "completions/mean_length": 43.25, "completions/mean_terminated_length": 43.25, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 181.7962962962963, "frac_reward_zero_std": 0.0, "grad_norm": 2.6079792976379395, "kl": 0.026860635727643967, "learning_rate": 1.0222222222222224e-07, "loss": -0.0301, "num_tokens": 2930835.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 9817 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.25, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 21.25, "completions/mean_terminated_length": 21.25, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 181.8148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.05670510232448578, "kl": 0.009110169485211372, "learning_rate": 1.0166666666666666e-07, "loss": 0.0005, "num_tokens": 2931164.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9818 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.75, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 19.75, "completions/mean_terminated_length": 19.75, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 181.83333333333334, "frac_reward_zero_std": 1.0, "grad_norm": 0.016828374937176704, "kl": 0.008870851248502731, "learning_rate": 1.0111111111111111e-07, "loss": 0.0004, "num_tokens": 2931491.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9819 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 181.85185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.023340687155723572, "kl": 0.003553185611963272, "learning_rate": 1.0055555555555556e-07, "loss": 0.0002, "num_tokens": 2931751.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 9820 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 36.0, "completions/clipped_ratio": 0.0, "completions/max_length": 40.0, "completions/max_terminated_length": 40.0, "completions/mean_length": 36.0, "completions/mean_terminated_length": 36.0, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 181.87037037037038, "frac_reward_zero_std": 1.0, "grad_norm": 0.23817844688892365, "kl": 0.12443385273218155, "learning_rate": 1.0000000000000001e-07, "loss": 0.0062, "num_tokens": 2932123.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9821 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 12.5, "completions/mean_terminated_length": 12.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 181.88888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.04062029346823692, "kl": 0.01253275154158473, "learning_rate": 9.944444444444446e-08, "loss": 0.0006, "num_tokens": 2932397.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9822 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 44.75, "completions/clipped_ratio": 0.0, "completions/max_length": 75.0, "completions/max_terminated_length": 75.0, "completions/mean_length": 44.75, "completions/mean_terminated_length": 44.75, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 181.90740740740742, "frac_reward_zero_std": 0.0, "grad_norm": 2.5689172744750977, "kl": 0.14506731927394867, "learning_rate": 9.888888888888889e-08, "loss": 0.175, "num_tokens": 2932800.0, "reward": 2.25, "reward_std": 3.947572946548462, "rewards/reward_combined/mean": 2.25, "rewards/reward_combined/std": 3.947573184967041, "step": 9823 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.25, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.25, "completions/mean_terminated_length": 11.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 181.92592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.02726629562675953, "kl": 0.006787734106183052, "learning_rate": 9.833333333333334e-08, "loss": 0.0003, "num_tokens": 2933073.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9824 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 181.94444444444446, "frac_reward_zero_std": 1.0, "grad_norm": 0.1313728392124176, "kl": 0.009340956748928875, "learning_rate": 9.777777777777779e-08, "loss": 0.0005, "num_tokens": 2933371.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9825 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.5, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 4.5, "completions/mean_terminated_length": 4.5, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 181.96296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.061752788722515106, "kl": 0.0036607086658477783, "learning_rate": 9.722222222222224e-08, "loss": 0.0002, "num_tokens": 2933589.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 9826 }, { "clip_ratio/high_max": 0.011363636702299118, "clip_ratio/high_mean": 0.011363636702299118, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.011363636702299118, "completion_length": 18.75, "completions/clipped_ratio": 0.0, "completions/max_length": 28.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 18.75, "completions/mean_terminated_length": 18.75, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 181.9814814814815, "frac_reward_zero_std": 0.0, "grad_norm": 1.3709022998809814, "kl": 0.7940510287880898, "learning_rate": 9.666666666666669e-08, "loss": 0.0209, "num_tokens": 2933888.0, "reward": 6.125, "reward_std": 3.75, "rewards/reward_combined/mean": 6.125, "rewards/reward_combined/std": 3.75, "step": 9827 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.25, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 9.25, "completions/mean_terminated_length": 9.25, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 182.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.0322665199637413, "kl": 0.012568796053528786, "learning_rate": 9.611111111111113e-08, "loss": 0.0006, "num_tokens": 2934149.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9828 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.25, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.25, "completions/mean_terminated_length": 6.25, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 182.0185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.01569255255162716, "kl": 0.0009101835021283478, "learning_rate": 9.555555555555556e-08, "loss": 0.0, "num_tokens": 2934382.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 9829 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.75, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.75, "completions/mean_terminated_length": 14.75, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 182.03703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.022658785805106163, "kl": 0.011900406796485186, "learning_rate": 9.5e-08, "loss": 0.0006, "num_tokens": 2934673.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9830 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 5.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 5.0, "completions/mean_terminated_length": 5.0, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 182.05555555555554, "frac_reward_zero_std": 1.0, "grad_norm": 0.004649420268833637, "kl": 0.0006609141710214317, "learning_rate": 9.444444444444445e-08, "loss": 0.0, "num_tokens": 2934893.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 9831 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 33.25, "completions/clipped_ratio": 0.0, "completions/max_length": 45.0, "completions/max_terminated_length": 45.0, "completions/mean_length": 33.25, "completions/mean_terminated_length": 33.25, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 182.07407407407408, "frac_reward_zero_std": 0.0, "grad_norm": 2.622938871383667, "kl": 0.11330960690975189, "learning_rate": 9.38888888888889e-08, "loss": 0.1109, "num_tokens": 2935250.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 9832 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.75, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 12.75, "completions/mean_terminated_length": 12.75, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 182.09259259259258, "frac_reward_zero_std": 1.0, "grad_norm": 0.06593188643455505, "kl": 0.02499358355998993, "learning_rate": 9.333333333333335e-08, "loss": 0.0015, "num_tokens": 2935553.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9833 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.25, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 6.25, "completions/mean_terminated_length": 6.25, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 182.11111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.04852103814482689, "kl": 0.004814498592168093, "learning_rate": 9.277777777777778e-08, "loss": 0.0004, "num_tokens": 2935782.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 9834 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 182.12962962962962, "frac_reward_zero_std": 0.0, "grad_norm": 5.609692573547363, "kl": 0.5564927607774734, "learning_rate": 9.222222222222223e-08, "loss": 0.0595, "num_tokens": 2936068.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 9835 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 182.14814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.03804126754403114, "kl": 0.0007749914802843705, "learning_rate": 9.166666666666668e-08, "loss": 0.0, "num_tokens": 2936324.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 9836 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.25, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 20.25, "completions/mean_terminated_length": 20.25, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 182.16666666666666, "frac_reward_zero_std": 1.0, "grad_norm": 0.04515055939555168, "kl": 0.012442635837942362, "learning_rate": 9.111111111111113e-08, "loss": 0.0006, "num_tokens": 2936657.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9837 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.25, "completions/clipped_ratio": 0.0, "completions/max_length": 40.0, "completions/max_terminated_length": 40.0, "completions/mean_length": 32.25, "completions/mean_terminated_length": 32.25, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 182.1851851851852, "frac_reward_zero_std": 0.0, "grad_norm": 1.6716152429580688, "kl": 0.142278503626585, "learning_rate": 9.055555555555558e-08, "loss": -0.145, "num_tokens": 2937038.0, "reward": 7.875, "reward_std": 0.25, "rewards/reward_combined/mean": 7.875, "rewards/reward_combined/std": 0.25, "step": 9838 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 47.5, "completions/clipped_ratio": 0.0, "completions/max_length": 51.0, "completions/max_terminated_length": 51.0, "completions/mean_length": 47.5, "completions/mean_terminated_length": 47.5, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "epoch": 182.2037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.13959850370883942, "kl": 0.0349765894934535, "learning_rate": 9e-08, "loss": 0.0018, "num_tokens": 2937448.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9839 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 49.0, "completions/clipped_ratio": 0.0, "completions/max_length": 51.0, "completions/max_terminated_length": 51.0, "completions/mean_length": 49.0, "completions/mean_terminated_length": 49.0, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 182.22222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.009779654443264008, "kl": 0.0415913462638855, "learning_rate": 8.944444444444445e-08, "loss": 0.0021, "num_tokens": 2937924.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 9840 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 182.24074074074073, "frac_reward_zero_std": 1.0, "grad_norm": 0.048366136848926544, "kl": 0.005801718682050705, "learning_rate": 8.88888888888889e-08, "loss": 0.0003, "num_tokens": 2938208.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9841 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.25, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 17.25, "completions/mean_terminated_length": 17.25, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 182.25925925925927, "frac_reward_zero_std": 1.0, "grad_norm": 2.192692756652832, "kl": 0.33329927548766136, "learning_rate": 8.833333333333334e-08, "loss": 0.019, "num_tokens": 2938537.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9842 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 182.27777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.01904347911477089, "kl": 0.027184002101421356, "learning_rate": 8.777777777777779e-08, "loss": 0.0014, "num_tokens": 2938753.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 9843 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 182.2962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.0483064278960228, "kl": 0.002520780311897397, "learning_rate": 8.722222222222223e-08, "loss": 0.0001, "num_tokens": 2939013.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9844 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.75, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 18.75, "completions/mean_terminated_length": 18.75, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 182.3148148148148, "frac_reward_zero_std": 0.0, "grad_norm": 6.446226596832275, "kl": 0.19954543560743332, "learning_rate": 8.666666666666668e-08, "loss": -0.0493, "num_tokens": 2939316.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 9845 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 182.33333333333334, "frac_reward_zero_std": 1.0, "grad_norm": 0.014107439666986465, "kl": 0.003423508256673813, "learning_rate": 8.611111111111112e-08, "loss": 0.0002, "num_tokens": 2939576.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 9846 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 182.35185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.0025797097478061914, "kl": 0.2821517288684845, "learning_rate": 8.555555555555557e-08, "loss": 0.0141, "num_tokens": 2939864.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9847 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 80.25, "completions/clipped_ratio": 0.25, "completions/max_length": 256.0, "completions/max_terminated_length": 33.0, "completions/mean_length": 80.25, "completions/mean_terminated_length": 21.666667938232422, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 182.37037037037038, "frac_reward_zero_std": 0.0, "grad_norm": 2.3509626388549805, "kl": 0.15860457718372345, "learning_rate": 8.500000000000001e-08, "loss": 0.5505, "num_tokens": 2940405.0, "reward": 4.300000190734863, "reward_std": 4.284857273101807, "rewards/reward_combined/mean": 4.300000190734863, "rewards/reward_combined/std": 4.284857273101807, "step": 9848 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 182.38888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.02226998284459114, "kl": 0.01366675877943635, "learning_rate": 8.444444444444444e-08, "loss": 0.0007, "num_tokens": 2940665.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9849 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.5, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 18.5, "completions/mean_terminated_length": 18.5, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 182.40740740740742, "frac_reward_zero_std": 1.0, "grad_norm": 0.012879644520580769, "kl": 0.0010179979726672173, "learning_rate": 8.388888888888889e-08, "loss": 0.0001, "num_tokens": 2940979.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9850 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.25, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 17.25, "completions/mean_terminated_length": 17.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 182.42592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.036241237074136734, "kl": 0.17460887879133224, "learning_rate": 8.333333333333334e-08, "loss": 0.0087, "num_tokens": 2941288.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9851 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 182.44444444444446, "frac_reward_zero_std": 1.0, "grad_norm": 0.014574444852769375, "kl": 0.030589360743761063, "learning_rate": 8.277777777777779e-08, "loss": 0.0015, "num_tokens": 2941582.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9852 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 30.75, "completions/clipped_ratio": 0.0, "completions/max_length": 37.0, "completions/max_terminated_length": 37.0, "completions/mean_length": 30.75, "completions/mean_terminated_length": 30.75, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 182.46296296296296, "frac_reward_zero_std": 0.0, "grad_norm": 1.7251883745193481, "kl": 0.3430441990494728, "learning_rate": 8.222222222222223e-08, "loss": 0.0285, "num_tokens": 2941933.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 9853 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 7.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 7.0, "completions/mean_terminated_length": 7.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 182.4814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.02996114455163479, "kl": 0.0009636759641580284, "learning_rate": 8.166666666666667e-08, "loss": 0.0, "num_tokens": 2942181.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 9854 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 182.5, "frac_reward_zero_std": 1.0, "grad_norm": 0.0911908820271492, "kl": 0.007259864592924714, "learning_rate": 8.111111111111112e-08, "loss": 0.0004, "num_tokens": 2942473.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9855 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 182.5185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.0009082421893253922, "kl": 0.0017090716282837093, "learning_rate": 8.055555555555557e-08, "loss": 0.0001, "num_tokens": 2942753.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9856 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 36.0, "completions/clipped_ratio": 0.0, "completions/max_length": 50.0, "completions/max_terminated_length": 50.0, "completions/mean_length": 36.0, "completions/mean_terminated_length": 36.0, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 182.53703703703704, "frac_reward_zero_std": 0.0, "grad_norm": 2.5356855392456055, "kl": 0.061611708253622055, "learning_rate": 8e-08, "loss": 0.0532, "num_tokens": 2943121.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 9857 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 182.55555555555554, "frac_reward_zero_std": 1.0, "grad_norm": 0.3356877267360687, "kl": 0.06433466728776693, "learning_rate": 7.944444444444445e-08, "loss": 0.0034, "num_tokens": 2943397.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9858 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.5, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 18.5, "completions/mean_terminated_length": 18.5, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 182.57407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.3290867507457733, "kl": 0.055178672075271606, "learning_rate": 7.88888888888889e-08, "loss": 0.0028, "num_tokens": 2943691.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 9859 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.25, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 19.25, "completions/mean_terminated_length": 19.25, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 182.59259259259258, "frac_reward_zero_std": 0.0, "grad_norm": 2.522357225418091, "kl": 0.02797626657411456, "learning_rate": 7.833333333333333e-08, "loss": 0.0072, "num_tokens": 2943988.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 9860 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 182.61111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.00894313957542181, "kl": 0.004157167510129511, "learning_rate": 7.777777777777778e-08, "loss": 0.0002, "num_tokens": 2944252.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9861 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 17.0, "completions/mean_terminated_length": 17.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 182.62962962962962, "frac_reward_zero_std": 0.0, "grad_norm": 1.530543565750122, "kl": 0.11779679544270039, "learning_rate": 7.722222222222223e-08, "loss": 0.0058, "num_tokens": 2944560.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 9862 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.25, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 8.25, "completions/mean_terminated_length": 8.25, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 182.64814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.05378071591258049, "kl": 0.019460704177618027, "learning_rate": 7.666666666666666e-08, "loss": 0.001, "num_tokens": 2944805.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 9863 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.5, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 8.5, "completions/mean_terminated_length": 8.5, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 182.66666666666666, "frac_reward_zero_std": 1.0, "grad_norm": 0.012400304898619652, "kl": 0.002723056124523282, "learning_rate": 7.611111111111111e-08, "loss": 0.0001, "num_tokens": 2945059.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9864 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.75, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 18.75, "completions/mean_terminated_length": 18.75, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 182.6851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.36974963545799255, "kl": 0.1216057245619595, "learning_rate": 7.555555555555556e-08, "loss": 0.0051, "num_tokens": 2945382.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9865 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.25, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 18.25, "completions/mean_terminated_length": 18.25, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 182.7037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.279241681098938, "kl": 0.057956137927249074, "learning_rate": 7.500000000000001e-08, "loss": 0.003, "num_tokens": 2945683.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9866 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 182.72222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.006704759784042835, "kl": 0.0005097910761833191, "learning_rate": 7.444444444444444e-08, "loss": 0.0, "num_tokens": 2945895.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 9867 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 182.74074074074073, "frac_reward_zero_std": 1.0, "grad_norm": 0.10191572457551956, "kl": 0.021661514416337013, "learning_rate": 7.388888888888889e-08, "loss": 0.0011, "num_tokens": 2946169.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9868 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.5, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 22.5, "completions/mean_terminated_length": 22.5, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 182.75925925925927, "frac_reward_zero_std": 1.0, "grad_norm": 0.07798439264297485, "kl": 0.014733481220901012, "learning_rate": 7.333333333333334e-08, "loss": 0.0007, "num_tokens": 2946503.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9869 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 182.77777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 5.699916437151842e-05, "kl": 5.349516868591309e-06, "learning_rate": 7.277777777777779e-08, "loss": 0.0, "num_tokens": 2946723.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 9870 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.5, "completions/clipped_ratio": 0.0, "completions/max_length": 33.0, "completions/max_terminated_length": 33.0, "completions/mean_length": 24.5, "completions/mean_terminated_length": 24.5, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 182.7962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 1.1710656881332397, "kl": 0.2957869619131088, "learning_rate": 7.222222222222222e-08, "loss": 0.0156, "num_tokens": 2947037.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9871 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 38.75, "completions/clipped_ratio": 0.0, "completions/max_length": 56.0, "completions/max_terminated_length": 56.0, "completions/mean_length": 38.75, "completions/mean_terminated_length": 38.75, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 182.8148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.16561022400856018, "kl": 0.10840902477502823, "learning_rate": 7.166666666666667e-08, "loss": 0.0049, "num_tokens": 2947460.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9872 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 182.83333333333334, "frac_reward_zero_std": 1.0, "grad_norm": 0.008319610729813576, "kl": 0.23918427526950836, "learning_rate": 7.111111111111112e-08, "loss": 0.0119, "num_tokens": 2947760.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9873 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.0, "completions/clipped_ratio": 0.0, "completions/max_length": 25.0, "completions/max_terminated_length": 25.0, "completions/mean_length": 21.0, "completions/mean_terminated_length": 21.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 182.85185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.036412496119737625, "kl": 0.01605005469173193, "learning_rate": 7.055555555555556e-08, "loss": 0.0008, "num_tokens": 2948092.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 9874 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "completions/clipped_ratio": 0.0, "completions/max_length": 176.0, "completions/max_terminated_length": 176.0, "completions/mean_length": 64.0, "completions/mean_terminated_length": 64.0, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 182.87037037037038, "frac_reward_zero_std": 0.0, "grad_norm": 1.5728391408920288, "kl": 0.13312770426273346, "learning_rate": 7e-08, "loss": 0.3725, "num_tokens": 2948584.0, "reward": 5.375, "reward_std": 4.25, "rewards/reward_combined/mean": 5.375, "rewards/reward_combined/std": 4.25, "step": 9875 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.0, "completions/clipped_ratio": 0.0, "completions/max_length": 31.0, "completions/max_terminated_length": 31.0, "completions/mean_length": 17.0, "completions/mean_terminated_length": 17.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 182.88888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.14100678265094757, "kl": 0.024264410138130188, "learning_rate": 6.944444444444444e-08, "loss": 0.0015, "num_tokens": 2948872.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9876 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 182.90740740740742, "frac_reward_zero_std": 1.0, "grad_norm": 0.024726906791329384, "kl": 0.009904567152261734, "learning_rate": 6.888888888888889e-08, "loss": 0.0005, "num_tokens": 2949184.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 9877 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 182.92592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.069968082010746, "kl": 0.03903050068765879, "learning_rate": 6.833333333333334e-08, "loss": 0.0021, "num_tokens": 2949456.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9878 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 182.94444444444446, "frac_reward_zero_std": 1.0, "grad_norm": 0.011741222813725471, "kl": 0.008235849440097809, "learning_rate": 6.777777777777778e-08, "loss": 0.0004, "num_tokens": 2949692.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 9879 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 182.96296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.011811763048171997, "kl": 0.005391545593738556, "learning_rate": 6.722222222222223e-08, "loss": 0.0003, "num_tokens": 2949900.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 9880 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.0, "completions/clipped_ratio": 0.0, "completions/max_length": 30.0, "completions/max_terminated_length": 30.0, "completions/mean_length": 24.0, "completions/mean_terminated_length": 24.0, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 182.9814814814815, "frac_reward_zero_std": 0.0, "grad_norm": 2.1384880542755127, "kl": 0.02729683741927147, "learning_rate": 6.666666666666668e-08, "loss": 0.0788, "num_tokens": 2950224.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 9881 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.25, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 15.25, "completions/mean_terminated_length": 15.25, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 183.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.08610053360462189, "kl": 0.017179434187710285, "learning_rate": 6.611111111111112e-08, "loss": 0.0009, "num_tokens": 2950509.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9882 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 45.0, "completions/clipped_ratio": 0.0, "completions/max_length": 51.0, "completions/max_terminated_length": 51.0, "completions/mean_length": 45.0, "completions/mean_terminated_length": 45.0, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 183.0185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.020119914785027504, "kl": 0.04600493051111698, "learning_rate": 6.555555555555556e-08, "loss": 0.0023, "num_tokens": 2950969.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 9883 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 183.03703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.00325769349001348, "kl": 0.00021807849407196045, "learning_rate": 6.5e-08, "loss": 0.0, "num_tokens": 2951181.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 9884 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.25, "completions/clipped_ratio": 0.0, "completions/max_length": 28.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 18.25, "completions/mean_terminated_length": 18.25, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 183.05555555555554, "frac_reward_zero_std": 1.0, "grad_norm": 0.3702092468738556, "kl": 0.05568135902285576, "learning_rate": 6.444444444444445e-08, "loss": 0.0036, "num_tokens": 2951474.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9885 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.75, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.75, "completions/mean_terminated_length": 6.75, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 183.07407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.032683469355106354, "kl": 0.0015753918560221791, "learning_rate": 6.38888888888889e-08, "loss": 0.0001, "num_tokens": 2951709.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 9886 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.5, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 8.5, "completions/mean_terminated_length": 8.5, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 183.09259259259258, "frac_reward_zero_std": 1.0, "grad_norm": 0.031234368681907654, "kl": 0.0015914947143755853, "learning_rate": 6.333333333333333e-08, "loss": 0.0001, "num_tokens": 2951963.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 9887 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 26.75, "completions/clipped_ratio": 0.0, "completions/max_length": 29.0, "completions/max_terminated_length": 29.0, "completions/mean_length": 26.75, "completions/mean_terminated_length": 26.75, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 183.11111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.05280414968729019, "kl": 0.08759231492877007, "learning_rate": 6.277777777777778e-08, "loss": 0.0043, "num_tokens": 2952306.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9888 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.75, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 12.75, "completions/mean_terminated_length": 12.75, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 183.12962962962962, "frac_reward_zero_std": 1.0, "grad_norm": 0.08090963214635849, "kl": 0.043560491409152746, "learning_rate": 6.222222222222223e-08, "loss": 0.0022, "num_tokens": 2952581.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9889 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 183.14814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.02677622064948082, "kl": 0.0033671462442725897, "learning_rate": 6.166666666666668e-08, "loss": 0.0002, "num_tokens": 2952879.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9890 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.0, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 23.0, "completions/mean_terminated_length": 23.0, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 183.16666666666666, "frac_reward_zero_std": 1.0, "grad_norm": 0.13071893155574799, "kl": 0.02933331672102213, "learning_rate": 6.111111111111112e-08, "loss": 0.0015, "num_tokens": 2953199.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9891 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.5, "completions/clipped_ratio": 0.0, "completions/max_length": 25.0, "completions/max_terminated_length": 25.0, "completions/mean_length": 22.5, "completions/mean_terminated_length": 22.5, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 183.1851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.08291850239038467, "kl": 0.027653749100863934, "learning_rate": 6.055555555555556e-08, "loss": 0.0014, "num_tokens": 2953537.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9892 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 38.0, "completions/clipped_ratio": 0.0, "completions/max_length": 44.0, "completions/max_terminated_length": 44.0, "completions/mean_length": 38.0, "completions/mean_terminated_length": 38.0, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 183.2037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.2538169026374817, "kl": 0.1373359113931656, "learning_rate": 6.000000000000001e-08, "loss": 0.007, "num_tokens": 2953941.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9893 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3.75, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 3.75, "completions/mean_terminated_length": 3.75, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "epoch": 183.22222222222223, "frac_reward_zero_std": 0.0, "grad_norm": 6.890188217163086, "kl": 0.07612712681293488, "learning_rate": 5.944444444444444e-08, "loss": -0.0676, "num_tokens": 2954152.0, "reward": 2.75, "reward_std": 1.5, "rewards/reward_combined/mean": 2.75, "rewards/reward_combined/std": 1.5, "step": 9894 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 183.24074074074073, "frac_reward_zero_std": 1.0, "grad_norm": 0.06125115230679512, "kl": 0.008987079374492168, "learning_rate": 5.888888888888889e-08, "loss": 0.0004, "num_tokens": 2954424.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9895 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 183.25925925925927, "frac_reward_zero_std": 1.0, "grad_norm": 0.027649885043501854, "kl": 0.0036431278567761183, "learning_rate": 5.833333333333334e-08, "loss": 0.0002, "num_tokens": 2954694.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9896 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 6.0, "completions/mean_terminated_length": 6.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 183.27777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.25425198674201965, "kl": 0.037411946803331375, "learning_rate": 5.777777777777778e-08, "loss": 0.0026, "num_tokens": 2954938.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 9897 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 183.2962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 3.650897997431457e-05, "kl": 4.634261131286621e-06, "learning_rate": 5.722222222222223e-08, "loss": 0.0, "num_tokens": 2955158.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 9898 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 183.3148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.08385016024112701, "kl": 0.06055685970932245, "learning_rate": 5.666666666666668e-08, "loss": 0.003, "num_tokens": 2955430.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9899 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 33.0, "completions/clipped_ratio": 0.0, "completions/max_length": 36.0, "completions/max_terminated_length": 36.0, "completions/mean_length": 33.0, "completions/mean_terminated_length": 33.0, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 183.33333333333334, "frac_reward_zero_std": 0.0, "grad_norm": 1.5100444555282593, "kl": 0.10248490422964096, "learning_rate": 5.611111111111111e-08, "loss": -0.0648, "num_tokens": 2955790.0, "reward": 7.875, "reward_std": 0.25, "rewards/reward_combined/mean": 7.875, "rewards/reward_combined/std": 0.25, "step": 9900 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.75, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 13.75, "completions/mean_terminated_length": 13.75, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 183.35185185185185, "frac_reward_zero_std": 0.0, "grad_norm": 2.613344430923462, "kl": 0.047133805230259895, "learning_rate": 5.555555555555556e-08, "loss": 0.0777, "num_tokens": 2956097.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 9901 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 183.37037037037038, "frac_reward_zero_std": 1.0, "grad_norm": 0.009358160197734833, "kl": 0.002703327452763915, "learning_rate": 5.5e-08, "loss": 0.0001, "num_tokens": 2956381.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9902 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 183.38888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.015386590734124184, "kl": 0.437531054019928, "learning_rate": 5.444444444444445e-08, "loss": 0.0219, "num_tokens": 2956665.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9903 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 183.40740740740742, "frac_reward_zero_std": 1.0, "grad_norm": 0.006199229974299669, "kl": 0.0057176221162080765, "learning_rate": 5.38888888888889e-08, "loss": 0.0003, "num_tokens": 2956977.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 9904 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 36.25, "completions/clipped_ratio": 0.0, "completions/max_length": 44.0, "completions/max_terminated_length": 44.0, "completions/mean_length": 36.25, "completions/mean_terminated_length": 36.25, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 183.42592592592592, "frac_reward_zero_std": 0.0, "grad_norm": 3.332332134246826, "kl": 0.0773181039839983, "learning_rate": 5.3333333333333334e-08, "loss": 0.0609, "num_tokens": 2957346.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 9905 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.25, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.25, "completions/mean_terminated_length": 14.25, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 183.44444444444446, "frac_reward_zero_std": 1.0, "grad_norm": 0.2211017608642578, "kl": 0.05059762066230178, "learning_rate": 5.277777777777778e-08, "loss": 0.0025, "num_tokens": 2957627.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9906 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 34.0, "completions/clipped_ratio": 0.0, "completions/max_length": 49.0, "completions/max_terminated_length": 49.0, "completions/mean_length": 34.0, "completions/mean_terminated_length": 34.0, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 183.46296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.08851920068264008, "kl": 0.036650958471000195, "learning_rate": 5.2222222222222224e-08, "loss": 0.0018, "num_tokens": 2957983.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9907 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 28.25, "completions/clipped_ratio": 0.0, "completions/max_length": 29.0, "completions/max_terminated_length": 29.0, "completions/mean_length": 28.25, "completions/mean_terminated_length": 28.25, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 183.4814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.049282256513834, "kl": 0.14714203029870987, "learning_rate": 5.166666666666667e-08, "loss": 0.0074, "num_tokens": 2958320.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9908 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.5, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 17.5, "completions/mean_terminated_length": 17.5, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 183.5, "frac_reward_zero_std": 1.0, "grad_norm": 0.15151600539684296, "kl": 0.1609562784433365, "learning_rate": 5.111111111111112e-08, "loss": 0.008, "num_tokens": 2958610.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9909 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 18.0, "completions/mean_terminated_length": 18.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 183.5185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.10224032402038574, "kl": 0.03363934904336929, "learning_rate": 5.0555555555555556e-08, "loss": 0.0017, "num_tokens": 2958902.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 9910 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 183.53703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.08160945773124695, "kl": 0.009141996502876282, "learning_rate": 5.0000000000000004e-08, "loss": 0.0005, "num_tokens": 2959146.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 9911 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 18.0, "completions/mean_terminated_length": 18.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 183.55555555555554, "frac_reward_zero_std": 1.0, "grad_norm": 0.00801890343427658, "kl": 0.0010880347690545022, "learning_rate": 4.9444444444444446e-08, "loss": 0.0001, "num_tokens": 2959458.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9912 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.5, "completions/clipped_ratio": 0.0, "completions/max_length": 28.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 21.5, "completions/mean_terminated_length": 21.5, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 183.57407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.05603796988725662, "kl": 0.014406768837943673, "learning_rate": 4.8888888888888894e-08, "loss": 0.0008, "num_tokens": 2959792.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 9913 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 183.59259259259258, "frac_reward_zero_std": 0.0, "grad_norm": 3.9476754665374756, "kl": 0.0474927865434438, "learning_rate": 4.833333333333334e-08, "loss": 0.0009, "num_tokens": 2960056.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 9914 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.25, "completions/clipped_ratio": 0.0, "completions/max_length": 29.0, "completions/max_terminated_length": 29.0, "completions/mean_length": 13.25, "completions/mean_terminated_length": 13.25, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 183.61111111111111, "frac_reward_zero_std": 0.0, "grad_norm": 4.867988109588623, "kl": 0.06095586717128754, "learning_rate": 4.777777777777778e-08, "loss": 0.2852, "num_tokens": 2960337.0, "reward": 3.125, "reward_std": 1.75, "rewards/reward_combined/mean": 3.125, "rewards/reward_combined/std": 1.75, "step": 9915 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 183.62962962962962, "frac_reward_zero_std": 1.0, "grad_norm": 0.00779526773840189, "kl": 0.2137647494673729, "learning_rate": 4.7222222222222226e-08, "loss": 0.0107, "num_tokens": 2960641.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9916 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 183.64814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.0077168201096355915, "kl": 0.0016192048788070679, "learning_rate": 4.6666666666666674e-08, "loss": 0.0001, "num_tokens": 2960853.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 9917 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 183.66666666666666, "frac_reward_zero_std": 1.0, "grad_norm": 0.0056382049806416035, "kl": 0.02686687558889389, "learning_rate": 4.6111111111111116e-08, "loss": 0.0013, "num_tokens": 2961069.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 9918 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.25, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 14.25, "completions/mean_terminated_length": 14.25, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 183.6851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.002255281200632453, "kl": 0.0013700044364668429, "learning_rate": 4.5555555555555564e-08, "loss": 0.0001, "num_tokens": 2961386.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9919 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 183.7037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.005266693886369467, "kl": 0.00015355348295997828, "learning_rate": 4.5e-08, "loss": 0.0, "num_tokens": 2961642.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 9920 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.75, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.75, "completions/mean_terminated_length": 14.75, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 183.72222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.047253381460905075, "kl": 0.014123301953077316, "learning_rate": 4.444444444444445e-08, "loss": 0.0008, "num_tokens": 2961933.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9921 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.5, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 8.5, "completions/mean_terminated_length": 8.5, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 183.74074074074073, "frac_reward_zero_std": 1.0, "grad_norm": 0.012482337653636932, "kl": 0.0026591152418404818, "learning_rate": 4.3888888888888896e-08, "loss": 0.0001, "num_tokens": 2962187.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9922 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 75.0, "completions/clipped_ratio": 0.25, "completions/max_length": 256.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 75.0, "completions/mean_terminated_length": 14.666666984558105, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 183.75925925925927, "frac_reward_zero_std": 0.0, "grad_norm": 1.1191285848617554, "kl": 0.041818855330348015, "learning_rate": 4.333333333333334e-08, "loss": 0.4493, "num_tokens": 2962719.0, "reward": 5.75, "reward_std": 4.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 4.5, "step": 9923 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 183.77777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.022166237235069275, "kl": 0.013718564994633198, "learning_rate": 4.2777777777777786e-08, "loss": 0.0007, "num_tokens": 2962979.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9924 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 183.7962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.0009190065320581198, "kl": 0.0017154152737930417, "learning_rate": 4.222222222222222e-08, "loss": 0.0001, "num_tokens": 2963259.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9925 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.25, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 18.25, "completions/mean_terminated_length": 18.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 183.8148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.21353137493133545, "kl": 0.027130769565701485, "learning_rate": 4.166666666666667e-08, "loss": 0.0014, "num_tokens": 2963552.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9926 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 42.25, "completions/clipped_ratio": 0.0, "completions/max_length": 121.0, "completions/max_terminated_length": 121.0, "completions/mean_length": 42.25, "completions/mean_terminated_length": 42.25, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 183.83333333333334, "frac_reward_zero_std": 0.0, "grad_norm": 1.5520133972167969, "kl": 0.23374981433153152, "learning_rate": 4.111111111111112e-08, "loss": 0.3925, "num_tokens": 2963945.0, "reward": 7.5, "reward_std": 1.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 1.0, "step": 9927 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 39.75, "completions/clipped_ratio": 0.0, "completions/max_length": 45.0, "completions/max_terminated_length": 45.0, "completions/mean_length": 39.75, "completions/mean_terminated_length": 39.75, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 183.85185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.16192340850830078, "kl": 0.10349265486001968, "learning_rate": 4.055555555555556e-08, "loss": 0.005, "num_tokens": 2964372.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9928 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.25, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 19.25, "completions/mean_terminated_length": 19.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 183.87037037037038, "frac_reward_zero_std": 1.0, "grad_norm": 0.16319434344768524, "kl": 0.02126255538314581, "learning_rate": 4e-08, "loss": 0.0013, "num_tokens": 2964693.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9929 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 28.5, "completions/clipped_ratio": 0.0, "completions/max_length": 29.0, "completions/max_terminated_length": 29.0, "completions/mean_length": 28.5, "completions/mean_terminated_length": 28.5, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 183.88888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.06398267298936844, "kl": 0.12844476103782654, "learning_rate": 3.944444444444445e-08, "loss": 0.0064, "num_tokens": 2965023.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9930 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.75, "completions/clipped_ratio": 0.0, "completions/max_length": 35.0, "completions/max_terminated_length": 35.0, "completions/mean_length": 23.75, "completions/mean_terminated_length": 23.75, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 183.90740740740742, "frac_reward_zero_std": 0.0, "grad_norm": 3.042402744293213, "kl": 0.19247152656316757, "learning_rate": 3.888888888888889e-08, "loss": 0.0009, "num_tokens": 2965346.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 9931 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.75, "completions/clipped_ratio": 0.0, "completions/max_length": 27.0, "completions/max_terminated_length": 27.0, "completions/mean_length": 22.75, "completions/mean_terminated_length": 22.75, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 183.92592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.31463173031806946, "kl": 0.05178556963801384, "learning_rate": 3.833333333333333e-08, "loss": 0.0029, "num_tokens": 2965689.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9932 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.5, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 4.5, "completions/mean_terminated_length": 4.5, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 183.94444444444446, "frac_reward_zero_std": 1.0, "grad_norm": 0.02465791255235672, "kl": 0.000699816650012508, "learning_rate": 3.777777777777778e-08, "loss": 0.0, "num_tokens": 2965907.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 9933 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 183.96296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.02132634073495865, "kl": 0.00299317913595587, "learning_rate": 3.722222222222222e-08, "loss": 0.0001, "num_tokens": 2966195.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9934 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.25, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 17.25, "completions/mean_terminated_length": 17.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 183.9814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.03998059034347534, "kl": 0.13342126831412315, "learning_rate": 3.666666666666667e-08, "loss": 0.0067, "num_tokens": 2966504.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9935 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.5, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 17.5, "completions/mean_terminated_length": 17.5, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 184.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.036010004580020905, "kl": 0.011051493929699063, "learning_rate": 3.611111111111111e-08, "loss": 0.0005, "num_tokens": 2966802.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9936 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.5, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 9.5, "completions/mean_terminated_length": 9.5, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 184.0185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.01069272868335247, "kl": 0.0021762512624263763, "learning_rate": 3.555555555555556e-08, "loss": 0.0001, "num_tokens": 2967060.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9937 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 184.03703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.010712586343288422, "kl": 0.001257583498954773, "learning_rate": 3.5e-08, "loss": 0.0001, "num_tokens": 2967272.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 9938 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 184.05555555555554, "frac_reward_zero_std": 1.0, "grad_norm": 0.024824630469083786, "kl": 0.0003771781921386719, "learning_rate": 3.4444444444444444e-08, "loss": 0.0, "num_tokens": 2967492.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 9939 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 184.07407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.013262162916362286, "kl": 0.0029720067977905273, "learning_rate": 3.388888888888889e-08, "loss": 0.0001, "num_tokens": 2967776.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9940 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 33.25, "completions/clipped_ratio": 0.0, "completions/max_length": 42.0, "completions/max_terminated_length": 42.0, "completions/mean_length": 33.25, "completions/mean_terminated_length": 33.25, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 184.09259259259258, "frac_reward_zero_std": 0.0, "grad_norm": 1.3183619976043701, "kl": 0.0902337022125721, "learning_rate": 3.333333333333334e-08, "loss": -0.1554, "num_tokens": 2968161.0, "reward": 7.875, "reward_std": 0.25, "rewards/reward_combined/mean": 7.875, "rewards/reward_combined/std": 0.25, "step": 9941 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.25, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.25, "completions/mean_terminated_length": 6.25, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 184.11111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.0145946703851223, "kl": 0.0006252307593967998, "learning_rate": 3.277777777777778e-08, "loss": 0.0, "num_tokens": 2968394.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 9942 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 184.12962962962962, "frac_reward_zero_std": 1.0, "grad_norm": 0.05495724454522133, "kl": 0.004928098293021321, "learning_rate": 3.2222222222222224e-08, "loss": 0.0002, "num_tokens": 2968688.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9943 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 184.14814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.07410069555044174, "kl": 0.009976951405405998, "learning_rate": 3.1666666666666666e-08, "loss": 0.0005, "num_tokens": 2968960.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9944 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.5, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 19.5, "completions/mean_terminated_length": 19.5, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 184.16666666666666, "frac_reward_zero_std": 1.0, "grad_norm": 0.021882209926843643, "kl": 0.0009261287050321698, "learning_rate": 3.1111111111111114e-08, "loss": 0.0, "num_tokens": 2969278.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9945 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.5, "completions/clipped_ratio": 0.0, "completions/max_length": 30.0, "completions/max_terminated_length": 30.0, "completions/mean_length": 24.5, "completions/mean_terminated_length": 24.5, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 184.1851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.6660093665122986, "kl": 0.11293127480894327, "learning_rate": 3.055555555555556e-08, "loss": 0.005, "num_tokens": 2969624.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 9946 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 184.2037037037037, "frac_reward_zero_std": 0.0, "grad_norm": 6.132060527801514, "kl": 0.05837882310152054, "learning_rate": 3.0000000000000004e-08, "loss": 0.2211, "num_tokens": 2969882.0, "reward": 2.75, "reward_std": 2.5, "rewards/reward_combined/mean": 2.75, "rewards/reward_combined/std": 2.5, "step": 9947 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.0, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 20.0, "completions/mean_terminated_length": 20.0, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 184.22222222222223, "frac_reward_zero_std": 1.0, "grad_norm": 0.06025475636124611, "kl": 0.01815013960003853, "learning_rate": 2.9444444444444446e-08, "loss": 0.0009, "num_tokens": 2970210.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9948 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 184.24074074074073, "frac_reward_zero_std": 1.0, "grad_norm": 0.013881431892514229, "kl": 0.0090715317055583, "learning_rate": 2.888888888888889e-08, "loss": 0.0005, "num_tokens": 2970482.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9949 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 184.25925925925927, "frac_reward_zero_std": 1.0, "grad_norm": 0.02500186301767826, "kl": 0.01115627121180296, "learning_rate": 2.833333333333334e-08, "loss": 0.0006, "num_tokens": 2970774.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9950 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 41.75, "completions/clipped_ratio": 0.0, "completions/max_length": 47.0, "completions/max_terminated_length": 47.0, "completions/mean_length": 41.75, "completions/mean_terminated_length": 41.75, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 184.27777777777777, "frac_reward_zero_std": 0.0, "grad_norm": 1.3290506601333618, "kl": 0.07382176071405411, "learning_rate": 2.777777777777778e-08, "loss": -0.0831, "num_tokens": 2971209.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 9951 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 184.2962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.0041139377281069756, "kl": 0.00013863742060493678, "learning_rate": 2.7222222222222226e-08, "loss": 0.0, "num_tokens": 2971465.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 9952 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.5, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 20.5, "completions/mean_terminated_length": 20.5, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 184.3148148148148, "frac_reward_zero_std": 1.0, "grad_norm": 0.1692848652601242, "kl": 0.03584760206285864, "learning_rate": 2.6666666666666667e-08, "loss": 0.0019, "num_tokens": 2971791.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9953 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 48.5, "completions/clipped_ratio": 0.0, "completions/max_length": 59.0, "completions/max_terminated_length": 59.0, "completions/mean_length": 48.5, "completions/mean_terminated_length": 48.5, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "epoch": 184.33333333333334, "frac_reward_zero_std": 0.0, "grad_norm": 1.0789384841918945, "kl": 0.13455419801175594, "learning_rate": 2.6111111111111112e-08, "loss": -0.047, "num_tokens": 2972265.0, "reward": 2.375, "reward_std": 1.25, "rewards/reward_combined/mean": 2.375, "rewards/reward_combined/std": 1.25, "step": 9954 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 5.5, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 5.5, "completions/mean_terminated_length": 5.5, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 184.35185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.19299007952213287, "kl": 0.011239625338930637, "learning_rate": 2.555555555555556e-08, "loss": 0.0007, "num_tokens": 2972487.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 9955 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.75, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 16.75, "completions/mean_terminated_length": 16.75, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 184.37037037037038, "frac_reward_zero_std": 1.0, "grad_norm": 0.13395163416862488, "kl": 0.15070828050374985, "learning_rate": 2.5000000000000002e-08, "loss": 0.0076, "num_tokens": 2972794.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9956 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.5, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 22.5, "completions/mean_terminated_length": 22.5, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 184.38888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.032740794122219086, "kl": 0.007932401029393077, "learning_rate": 2.4444444444444447e-08, "loss": 0.0004, "num_tokens": 2973112.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9957 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 184.40740740740742, "frac_reward_zero_std": 1.0, "grad_norm": 0.041739996522665024, "kl": 0.005151836201548576, "learning_rate": 2.388888888888889e-08, "loss": 0.0003, "num_tokens": 2973376.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9958 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.0, "completions/clipped_ratio": 0.0, "completions/max_length": 28.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 19.0, "completions/mean_terminated_length": 19.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 184.42592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.3413764238357544, "kl": 0.20986133813858032, "learning_rate": 2.3333333333333337e-08, "loss": 0.011, "num_tokens": 2973672.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9959 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.0, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 17.0, "completions/mean_terminated_length": 17.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 184.44444444444446, "frac_reward_zero_std": 0.0, "grad_norm": 4.0175580978393555, "kl": 0.1887547057121992, "learning_rate": 2.2777777777777782e-08, "loss": 0.0691, "num_tokens": 2973988.0, "reward": 2.75, "reward_std": 1.5, "rewards/reward_combined/mean": 2.75, "rewards/reward_combined/std": 1.5, "step": 9960 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 184.46296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.044031962752342224, "kl": 0.0038799929316155612, "learning_rate": 2.2222222222222224e-08, "loss": 0.0002, "num_tokens": 2974278.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9961 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 184.4814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.012155394069850445, "kl": 0.00048207491636276245, "learning_rate": 2.166666666666667e-08, "loss": 0.0, "num_tokens": 2974490.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 9962 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 37.75, "completions/clipped_ratio": 0.0, "completions/max_length": 74.0, "completions/max_terminated_length": 74.0, "completions/mean_length": 37.75, "completions/mean_terminated_length": 37.75, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 184.5, "frac_reward_zero_std": 0.0, "grad_norm": 3.601901054382324, "kl": 0.04421086981892586, "learning_rate": 2.111111111111111e-08, "loss": 0.2897, "num_tokens": 2974861.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 9963 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.25, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 9.25, "completions/mean_terminated_length": 9.25, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 184.5185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.0325482003390789, "kl": 0.01269484218209982, "learning_rate": 2.055555555555556e-08, "loss": 0.0006, "num_tokens": 2975122.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9964 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 184.53703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.012690302915871143, "kl": 0.2306986004114151, "learning_rate": 2e-08, "loss": 0.0115, "num_tokens": 2975424.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9965 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.5, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 19.5, "completions/mean_terminated_length": 19.5, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 184.55555555555554, "frac_reward_zero_std": 1.0, "grad_norm": 0.024331066757440567, "kl": 0.005846029380336404, "learning_rate": 1.9444444444444445e-08, "loss": 0.0003, "num_tokens": 2975730.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9966 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.5, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 2.5, "completions/mean_terminated_length": 2.5, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 184.57407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.015938330441713333, "kl": 0.002300865948200226, "learning_rate": 1.888888888888889e-08, "loss": 0.0001, "num_tokens": 2975936.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 9967 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.25, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.25, "completions/mean_terminated_length": 13.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 184.59259259259258, "frac_reward_zero_std": 1.0, "grad_norm": 0.0426902212202549, "kl": 0.003031059866771102, "learning_rate": 1.8333333333333335e-08, "loss": 0.0001, "num_tokens": 2976201.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9968 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.75, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 20.75, "completions/mean_terminated_length": 20.75, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 184.61111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.1588970422744751, "kl": 0.09675305336713791, "learning_rate": 1.777777777777778e-08, "loss": 0.0048, "num_tokens": 2976512.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9969 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.75, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 4.75, "completions/mean_terminated_length": 4.75, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 184.62962962962962, "frac_reward_zero_std": 1.0, "grad_norm": 0.040012478828430176, "kl": 0.027997653931379318, "learning_rate": 1.7222222222222222e-08, "loss": 0.0014, "num_tokens": 2976731.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 9970 }, { "clip_ratio/high_max": 0.014705882407724857, "clip_ratio/high_mean": 0.014705882407724857, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.014705882407724857, "completion_length": 15.25, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 15.25, "completions/mean_terminated_length": 15.25, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 184.64814814814815, "frac_reward_zero_std": 0.0, "grad_norm": 3.466801166534424, "kl": 0.08145990036427975, "learning_rate": 1.666666666666667e-08, "loss": -0.0164, "num_tokens": 2977016.0, "reward": 6.875, "reward_std": 2.25, "rewards/reward_combined/mean": 6.875, "rewards/reward_combined/std": 2.25, "step": 9971 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 33.25, "completions/clipped_ratio": 0.0, "completions/max_length": 38.0, "completions/max_terminated_length": 38.0, "completions/mean_length": 33.25, "completions/mean_terminated_length": 33.25, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 184.66666666666666, "frac_reward_zero_std": 0.0, "grad_norm": 2.783857822418213, "kl": 0.42891138046979904, "learning_rate": 1.6111111111111112e-08, "loss": 0.0233, "num_tokens": 2977365.0, "reward": 5.125, "reward_std": 3.4731109142303467, "rewards/reward_combined/mean": 5.125, "rewards/reward_combined/std": 3.4731109142303467, "step": 9972 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.75, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 13.75, "completions/mean_terminated_length": 13.75, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 184.6851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.1304885298013687, "kl": 0.07356803491711617, "learning_rate": 1.5555555555555557e-08, "loss": 0.0038, "num_tokens": 2977644.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9973 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 184.7037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.00794475432485342, "kl": 0.0032549090683460236, "learning_rate": 1.5000000000000002e-08, "loss": 0.0002, "num_tokens": 2977904.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 9974 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.5, "completions/clipped_ratio": 0.0, "completions/max_length": 34.0, "completions/max_terminated_length": 34.0, "completions/mean_length": 23.5, "completions/mean_terminated_length": 23.5, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 184.72222222222223, "frac_reward_zero_std": 0.0, "grad_norm": 2.7088069915771484, "kl": 0.30865050479769707, "learning_rate": 1.4444444444444445e-08, "loss": -0.127, "num_tokens": 2978218.0, "reward": 3.25, "reward_std": 1.5, "rewards/reward_combined/mean": 3.25, "rewards/reward_combined/std": 1.5, "step": 9975 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.25, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 16.25, "completions/mean_terminated_length": 16.25, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 184.74074074074073, "frac_reward_zero_std": 1.0, "grad_norm": 0.03795822709798813, "kl": 0.27933743596076965, "learning_rate": 1.388888888888889e-08, "loss": 0.014, "num_tokens": 2978507.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9976 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 184.75925925925927, "frac_reward_zero_std": 1.0, "grad_norm": 0.011764636263251305, "kl": 0.008241906762123108, "learning_rate": 1.3333333333333334e-08, "loss": 0.0004, "num_tokens": 2978743.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_combined/mean": 4.0, "rewards/reward_combined/std": 0.0, "step": 9977 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.25, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 15.25, "completions/mean_terminated_length": 15.25, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 184.77777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.018023120239377022, "kl": 0.031543536111712456, "learning_rate": 1.277777777777778e-08, "loss": 0.0016, "num_tokens": 2979036.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9978 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.5, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 184.7962962962963, "frac_reward_zero_std": 1.0, "grad_norm": 0.011541374959051609, "kl": 0.0015352022019214928, "learning_rate": 1.2222222222222224e-08, "loss": 0.0001, "num_tokens": 2979358.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9979 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 33.25, "completions/clipped_ratio": 0.0, "completions/max_length": 36.0, "completions/max_terminated_length": 36.0, "completions/mean_length": 33.25, "completions/mean_terminated_length": 33.25, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 184.8148148148148, "frac_reward_zero_std": 0.0, "grad_norm": 2.861678123474121, "kl": 0.2231302559375763, "learning_rate": 1.1666666666666669e-08, "loss": 0.0116, "num_tokens": 2979715.0, "reward": 5.75, "reward_std": 2.598076105117798, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 2.598076105117798, "step": 9980 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.5, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 19.5, "completions/mean_terminated_length": 19.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 184.83333333333334, "frac_reward_zero_std": 1.0, "grad_norm": 0.07583395391702652, "kl": 0.01668160129338503, "learning_rate": 1.1111111111111112e-08, "loss": 0.0008, "num_tokens": 2980045.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9981 }, { "clip_ratio/high_max": 0.006666666828095913, "clip_ratio/high_mean": 0.006666666828095913, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.006666666828095913, "completion_length": 36.0, "completions/clipped_ratio": 0.0, "completions/max_length": 38.0, "completions/max_terminated_length": 38.0, "completions/mean_length": 36.0, "completions/mean_terminated_length": 36.0, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 184.85185185185185, "frac_reward_zero_std": 0.0, "grad_norm": 1.1910253763198853, "kl": 0.1525469496846199, "learning_rate": 1.0555555555555555e-08, "loss": 0.0035, "num_tokens": 2980417.0, "reward": 6.25, "reward_std": 3.5, "rewards/reward_combined/mean": 6.25, "rewards/reward_combined/std": 3.5, "step": 9982 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.75, "completions/clipped_ratio": 0.0, "completions/max_length": 224.0, "completions/max_terminated_length": 224.0, "completions/mean_length": 64.75, "completions/mean_terminated_length": 64.75, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 184.87037037037038, "frac_reward_zero_std": 1.0, "grad_norm": 0.0560518354177475, "kl": 0.011888422537595034, "learning_rate": 1e-08, "loss": 0.0007, "num_tokens": 2980896.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9983 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 184.88888888888889, "frac_reward_zero_std": 1.0, "grad_norm": 0.03774970397353172, "kl": 0.010992761235684156, "learning_rate": 9.444444444444445e-09, "loss": 0.0005, "num_tokens": 2981176.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9984 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0, "completions/clipped_ratio": 0.0, "completions/max_length": 44.0, "completions/max_terminated_length": 44.0, "completions/mean_length": 32.0, "completions/mean_terminated_length": 32.0, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 184.90740740740742, "frac_reward_zero_std": 0.0, "grad_norm": 3.0903592109680176, "kl": 0.1625884808599949, "learning_rate": 8.88888888888889e-09, "loss": 0.1719, "num_tokens": 2981540.0, "reward": 4.75, "reward_std": 3.4034295082092285, "rewards/reward_combined/mean": 4.75, "rewards/reward_combined/std": 3.4034297466278076, "step": 9985 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.75, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 18.75, "completions/mean_terminated_length": 18.75, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 184.92592592592592, "frac_reward_zero_std": 1.0, "grad_norm": 0.18772777915000916, "kl": 0.036159733310341835, "learning_rate": 8.333333333333335e-09, "loss": 0.0018, "num_tokens": 2981835.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9986 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 184.94444444444446, "frac_reward_zero_std": 1.0, "grad_norm": 0.027110587805509567, "kl": 0.4445309638977051, "learning_rate": 7.777777777777778e-09, "loss": 0.0222, "num_tokens": 2982119.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9987 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.5, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 10.5, "completions/mean_terminated_length": 10.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 184.96296296296296, "frac_reward_zero_std": 1.0, "grad_norm": 0.22838537395000458, "kl": 0.01982681709341705, "learning_rate": 7.222222222222223e-09, "loss": 0.001, "num_tokens": 2982381.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 9988 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.75, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 12.75, "completions/mean_terminated_length": 12.75, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 184.9814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.06635617464780807, "kl": 0.025192919943947345, "learning_rate": 6.666666666666667e-09, "loss": 0.0015, "num_tokens": 2982684.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9989 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 40.75, "completions/clipped_ratio": 0.0, "completions/max_length": 52.0, "completions/max_terminated_length": 52.0, "completions/mean_length": 40.75, "completions/mean_terminated_length": 40.75, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 185.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.09455134719610214, "kl": 0.07094737514853477, "learning_rate": 6.111111111111112e-09, "loss": 0.004, "num_tokens": 2983071.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9990 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 10.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 185.0185185185185, "frac_reward_zero_std": 1.0, "grad_norm": 0.020463278517127037, "kl": 0.0014620900037698448, "learning_rate": 5.555555555555556e-09, "loss": 0.0001, "num_tokens": 2983331.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 9991 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 185.03703703703704, "frac_reward_zero_std": 1.0, "grad_norm": 0.011629568412899971, "kl": 0.0013190507888793945, "learning_rate": 5e-09, "loss": 0.0001, "num_tokens": 2983543.0, "reward": 3.5, "reward_std": 0.0, "rewards/reward_combined/mean": 3.5, "rewards/reward_combined/std": 0.0, "step": 9992 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 185.05555555555554, "frac_reward_zero_std": 1.0, "grad_norm": 6.849770579719916e-05, "kl": 5.7891011238098145e-06, "learning_rate": 4.444444444444445e-09, "loss": 0.0, "num_tokens": 2983763.0, "reward": 3.0, "reward_std": 0.0, "rewards/reward_combined/mean": 3.0, "rewards/reward_combined/std": 0.0, "step": 9993 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.75, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 13.75, "completions/mean_terminated_length": 13.75, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 185.07407407407408, "frac_reward_zero_std": 1.0, "grad_norm": 0.11228924244642258, "kl": 0.020551201421767473, "learning_rate": 3.888888888888889e-09, "loss": 0.001, "num_tokens": 2984050.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9994 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.75, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 18.75, "completions/mean_terminated_length": 18.75, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 185.09259259259258, "frac_reward_zero_std": 1.0, "grad_norm": 0.02327764965593815, "kl": 0.010761368088424206, "learning_rate": 3.3333333333333334e-09, "loss": 0.0005, "num_tokens": 2984345.0, "reward": 7.5, "reward_std": 0.0, "rewards/reward_combined/mean": 7.5, "rewards/reward_combined/std": 0.0, "step": 9995 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 185.11111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.055099062621593475, "kl": 0.1737062782049179, "learning_rate": 2.777777777777778e-09, "loss": 0.0087, "num_tokens": 2984629.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9996 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.25, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 21.25, "completions/mean_terminated_length": 21.25, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 185.12962962962962, "frac_reward_zero_std": 0.0, "grad_norm": 3.5090417861938477, "kl": 0.11760040000081062, "learning_rate": 2.2222222222222225e-09, "loss": -0.0156, "num_tokens": 2984934.0, "reward": 5.75, "reward_std": 3.5, "rewards/reward_combined/mean": 5.75, "rewards/reward_combined/std": 3.5, "step": 9997 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 185.14814814814815, "frac_reward_zero_std": 1.0, "grad_norm": 0.043324749916791916, "kl": 0.014170304406434298, "learning_rate": 1.6666666666666667e-09, "loss": 0.0007, "num_tokens": 2985204.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9998 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.25, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.25, "completions/mean_terminated_length": 11.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 185.16666666666666, "frac_reward_zero_std": 1.0, "grad_norm": 0.05552287772297859, "kl": 0.011619855649769306, "learning_rate": 1.1111111111111113e-09, "loss": 0.0006, "num_tokens": 2985477.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 9999 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.75, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 17.75, "completions/mean_terminated_length": 17.75, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 185.1851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.025490732863545418, "kl": 0.0024291907902806997, "learning_rate": 5.555555555555556e-10, "loss": 0.0001, "num_tokens": 2985788.0, "reward": 8.0, "reward_std": 0.0, "rewards/reward_combined/mean": 8.0, "rewards/reward_combined/std": 0.0, "step": 10000 } ], "logging_steps": 1, "max_steps": 10000, "num_input_tokens_seen": 2985788, "num_train_epochs": 186, "save_steps": 250, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }