{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.0848626286199215, "eval_steps": 500, "global_step": 800, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1955.0, "completions/mean_length": 1253.625, "completions/mean_terminated_length": 837.5238037109375, "completions/min_length": 242.0, "completions/min_terminated_length": 242.0, "epoch": 0.00010607828577490188, "frac_reward_zero_std": 0.0, "grad_norm": 1.453125, "kl": 0.0, "learning_rate": 8e-06, "loss": -0.0, "num_tokens": 97652.0, "reward": 1.7952632904052734, "reward_std": 1.2897486686706543, "rewards/reward_fn/mean": 1.7952632904052734, "rewards/reward_fn/std": 1.2897486686706543, "step": 1 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 192.0, "completions/max_terminated_length": 192.0, "completions/mean_length": 163.46875, "completions/mean_terminated_length": 163.46875, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "epoch": 0.00021215657154980376, "frac_reward_zero_std": 0.0, "grad_norm": 2.296875, "kl": 0.014410574920475483, "learning_rate": 7.9996e-06, "loss": 0.0006, "num_tokens": 133123.0, "reward": 2.7671689987182617, "reward_std": 0.028316717594861984, "rewards/reward_fn/mean": 2.7671689987182617, "rewards/reward_fn/std": 0.02831670455634594, "step": 2 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 68.0, "completions/max_terminated_length": 68.0, "completions/mean_length": 61.9375, "completions/mean_terminated_length": 61.9375, "completions/min_length": 59.0, "completions/min_terminated_length": 59.0, "epoch": 0.0003182348573247056, "frac_reward_zero_std": 1.0, "grad_norm": 0.498046875, "kl": 0.039460036728996783, "learning_rate": 7.9992e-06, "loss": 0.0016, "num_tokens": 171169.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 3 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 260.0, "completions/max_terminated_length": 260.0, "completions/mean_length": 178.46875, "completions/mean_terminated_length": 178.46875, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "epoch": 0.0004243131430996075, "frac_reward_zero_std": 0.0, "grad_norm": 1.328125, "kl": 0.012384430738165975, "learning_rate": 7.9988e-06, "loss": 0.0005, "num_tokens": 217520.0, "reward": 2.901700258255005, "reward_std": 0.015337609685957432, "rewards/reward_fn/mean": 2.901700258255005, "rewards/reward_fn/std": 0.015337574295699596, "step": 4 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 229.0, "completions/max_terminated_length": 229.0, "completions/mean_length": 168.71875, "completions/mean_terminated_length": 168.71875, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "epoch": 0.0005303914288745094, "frac_reward_zero_std": 1.0, "grad_norm": 0.1572265625, "kl": 0.02899795339908451, "learning_rate": 7.9984e-06, "loss": 0.0012, "num_tokens": 255751.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 5 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 213.0, "completions/max_terminated_length": 213.0, "completions/mean_length": 148.59375, "completions/mean_terminated_length": 148.59375, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "epoch": 0.0006364697146494112, "frac_reward_zero_std": 0.0, "grad_norm": 3.109375, "kl": 0.0448116734623909, "learning_rate": 7.998e-06, "loss": 0.0018, "num_tokens": 290010.0, "reward": 3.875, "reward_std": 0.7071067690849304, "rewards/reward_fn/mean": 3.875, "rewards/reward_fn/std": 0.7071067690849304, "step": 6 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 361.0, "completions/max_terminated_length": 361.0, "completions/mean_length": 243.5, "completions/mean_terminated_length": 243.5, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "epoch": 0.0007425480004243131, "frac_reward_zero_std": 0.0, "grad_norm": 3.046875, "kl": 0.026772579178214073, "learning_rate": 7.9976e-06, "loss": 0.0011, "num_tokens": 331850.0, "reward": 3.281740188598633, "reward_std": 0.5656248331069946, "rewards/reward_fn/mean": 3.281740188598633, "rewards/reward_fn/std": 0.5656247735023499, "step": 7 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 200.0, "completions/max_terminated_length": 200.0, "completions/mean_length": 193.84375, "completions/mean_terminated_length": 193.84375, "completions/min_length": 190.0, "completions/min_terminated_length": 190.0, "epoch": 0.000848626286199215, "frac_reward_zero_std": 1.0, "grad_norm": 0.10205078125, "kl": 0.014506876847008243, "learning_rate": 7.9972e-06, "loss": 0.0006, "num_tokens": 380037.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 8 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 112.0, "completions/max_terminated_length": 112.0, "completions/mean_length": 104.21875, "completions/mean_terminated_length": 104.21875, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "epoch": 0.0009547045719741168, "frac_reward_zero_std": 1.0, "grad_norm": 0.2109375, "kl": 0.048403532593511045, "learning_rate": 7.9968e-06, "loss": 0.0019, "num_tokens": 415468.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 9 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 279.0, "completions/max_terminated_length": 279.0, "completions/mean_length": 252.0, "completions/mean_terminated_length": 252.0, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "epoch": 0.0010607828577490189, "frac_reward_zero_std": 1.0, "grad_norm": 0.11181640625, "kl": 0.02490026137093082, "learning_rate": 7.9964e-06, "loss": 0.001, "num_tokens": 478956.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 10 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 102.0, "completions/max_terminated_length": 102.0, "completions/mean_length": 75.84375, "completions/mean_terminated_length": 75.84375, "completions/min_length": 66.0, "completions/min_terminated_length": 66.0, "epoch": 0.0011668611435239206, "frac_reward_zero_std": 1.0, "grad_norm": 0.26953125, "kl": 0.07843554252758622, "learning_rate": 7.996e-06, "loss": 0.0031, "num_tokens": 518119.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 11 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 195.5625, "completions/mean_terminated_length": 195.5625, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "epoch": 0.0012729394292988225, "frac_reward_zero_std": 1.0, "grad_norm": 0.1611328125, "kl": 0.03463225986342877, "learning_rate": 7.995599999999998e-06, "loss": 0.0014, "num_tokens": 556985.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 12 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 681.0, "completions/max_terminated_length": 681.0, "completions/mean_length": 457.15625, "completions/mean_terminated_length": 457.15625, "completions/min_length": 344.0, "completions/min_terminated_length": 344.0, "epoch": 0.0013790177150737244, "frac_reward_zero_std": 0.0, "grad_norm": 1.1484375, "kl": 0.02525286824675277, "learning_rate": 7.9952e-06, "loss": 0.001, "num_tokens": 619038.0, "reward": 3.6924948692321777, "reward_std": 0.541310727596283, "rewards/reward_fn/mean": 3.6924948692321777, "rewards/reward_fn/std": 0.541310727596283, "step": 13 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1231.0, "completions/max_terminated_length": 1231.0, "completions/mean_length": 625.96875, "completions/mean_terminated_length": 625.96875, "completions/min_length": 309.0, "completions/min_terminated_length": 309.0, "epoch": 0.0014850960008486263, "frac_reward_zero_std": 0.0, "grad_norm": 1.125, "kl": 0.020447831630008295, "learning_rate": 7.9948e-06, "loss": 0.0008, "num_tokens": 679165.0, "reward": 2.675490617752075, "reward_std": 0.3050808012485504, "rewards/reward_fn/mean": 2.675490617752075, "rewards/reward_fn/std": 0.3050808012485504, "step": 14 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 215.0, "completions/max_terminated_length": 215.0, "completions/mean_length": 187.84375, "completions/mean_terminated_length": 187.84375, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "epoch": 0.0015911742866235282, "frac_reward_zero_std": 1.0, "grad_norm": 0.1484375, "kl": 0.04133811534848064, "learning_rate": 7.9944e-06, "loss": 0.0017, "num_tokens": 724184.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 15 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 357.0, "completions/max_terminated_length": 357.0, "completions/mean_length": 193.4375, "completions/mean_terminated_length": 193.4375, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "epoch": 0.00169725257239843, "frac_reward_zero_std": 0.0, "grad_norm": 90.5, "kl": 0.042876197723671794, "learning_rate": 7.994e-06, "loss": 0.0017, "num_tokens": 761606.0, "reward": 2.927339792251587, "reward_std": 0.06646312773227692, "rewards/reward_fn/mean": 2.927339792251587, "rewards/reward_fn/std": 0.06646312773227692, "step": 16 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 247.0, "completions/max_terminated_length": 247.0, "completions/mean_length": 171.4375, "completions/mean_terminated_length": 171.4375, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "epoch": 0.001803330858173332, "frac_reward_zero_std": 0.0, "grad_norm": 2.265625, "kl": 0.026966359262587503, "learning_rate": 7.9936e-06, "loss": 0.0011, "num_tokens": 805396.0, "reward": 2.997476577758789, "reward_std": 0.1855943202972412, "rewards/reward_fn/mean": 2.997476577758789, "rewards/reward_fn/std": 0.1855943202972412, "step": 17 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 149.0, "completions/max_terminated_length": 149.0, "completions/mean_length": 118.40625, "completions/mean_terminated_length": 118.40625, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "epoch": 0.0019094091439482337, "frac_reward_zero_std": 0.0, "grad_norm": 2.59375, "kl": 0.029081921704346314, "learning_rate": 7.9932e-06, "loss": 0.0012, "num_tokens": 829697.0, "reward": 3.9648869037628174, "reward_std": 0.19862982630729675, "rewards/reward_fn/mean": 3.9648869037628174, "rewards/reward_fn/std": 0.19862982630729675, "step": 18 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 421.0, "completions/max_terminated_length": 421.0, "completions/mean_length": 300.71875, "completions/mean_terminated_length": 300.71875, "completions/min_length": 230.0, "completions/min_terminated_length": 230.0, "epoch": 0.002015487429723136, "frac_reward_zero_std": 0.0, "grad_norm": 1.1484375, "kl": 0.022967668395722285, "learning_rate": 7.992799999999999e-06, "loss": 0.0009, "num_tokens": 875000.0, "reward": 3.8433151245117188, "reward_std": 0.42122551798820496, "rewards/reward_fn/mean": 3.8433151245117188, "rewards/reward_fn/std": 0.42122551798820496, "step": 19 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 152.0, "completions/max_terminated_length": 152.0, "completions/mean_length": 129.53125, "completions/mean_terminated_length": 129.53125, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "epoch": 0.0021215657154980377, "frac_reward_zero_std": 1.0, "grad_norm": 0.12890625, "kl": 0.027954923105426133, "learning_rate": 7.9924e-06, "loss": 0.0011, "num_tokens": 902569.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 20 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 157.0, "completions/max_terminated_length": 157.0, "completions/mean_length": 145.59375, "completions/mean_terminated_length": 145.59375, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "epoch": 0.0022276440012729396, "frac_reward_zero_std": 1.0, "grad_norm": 0.10595703125, "kl": 0.021872296027140692, "learning_rate": 7.991999999999999e-06, "loss": 0.0009, "num_tokens": 922172.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 21 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 584.0, "completions/max_terminated_length": 584.0, "completions/mean_length": 359.15625, "completions/mean_terminated_length": 359.15625, "completions/min_length": 304.0, "completions/min_terminated_length": 304.0, "epoch": 0.002333722287047841, "frac_reward_zero_std": 0.0, "grad_norm": 1.4296875, "kl": 0.01314439470297657, "learning_rate": 7.9916e-06, "loss": 0.0005, "num_tokens": 986177.0, "reward": 2.712355613708496, "reward_std": 0.04796939715743065, "rewards/reward_fn/mean": 2.712355613708496, "rewards/reward_fn/std": 0.04796938970685005, "step": 22 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 546.0, "completions/max_terminated_length": 546.0, "completions/mean_length": 394.5625, "completions/mean_terminated_length": 394.5625, "completions/min_length": 299.0, "completions/min_terminated_length": 299.0, "epoch": 0.002439800572822743, "frac_reward_zero_std": 0.0, "grad_norm": 1.328125, "kl": 0.022076091496273875, "learning_rate": 7.991199999999999e-06, "loss": 0.0009, "num_tokens": 1037555.0, "reward": 3.5815935134887695, "reward_std": 0.5874396562576294, "rewards/reward_fn/mean": 3.5815935134887695, "rewards/reward_fn/std": 0.5874396562576294, "step": 23 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 541.0, "completions/max_terminated_length": 541.0, "completions/mean_length": 312.3125, "completions/mean_terminated_length": 312.3125, "completions/min_length": 243.0, "completions/min_terminated_length": 243.0, "epoch": 0.002545878858597645, "frac_reward_zero_std": 0.0, "grad_norm": 1.015625, "kl": 0.022688235039822757, "learning_rate": 7.9908e-06, "loss": 0.0009, "num_tokens": 1094173.0, "reward": 2.881844997406006, "reward_std": 0.045588310807943344, "rewards/reward_fn/mean": 2.881844997406006, "rewards/reward_fn/std": 0.04558834806084633, "step": 24 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 269.0, "completions/max_terminated_length": 269.0, "completions/mean_length": 229.375, "completions/mean_terminated_length": 229.375, "completions/min_length": 222.0, "completions/min_terminated_length": 222.0, "epoch": 0.002651957144372547, "frac_reward_zero_std": 1.0, "grad_norm": 0.0810546875, "kl": 0.012471042369725183, "learning_rate": 7.9904e-06, "loss": 0.0005, "num_tokens": 1146825.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 25 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 245.0, "completions/max_terminated_length": 245.0, "completions/mean_length": 197.0, "completions/mean_terminated_length": 197.0, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "epoch": 0.0027580354301474487, "frac_reward_zero_std": 0.0, "grad_norm": 1.8203125, "kl": 0.03129113302566111, "learning_rate": 7.99e-06, "loss": 0.0013, "num_tokens": 1187497.0, "reward": 2.8278141021728516, "reward_std": 0.21510717272758484, "rewards/reward_fn/mean": 2.8278141021728516, "rewards/reward_fn/std": 0.21510712802410126, "step": 26 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 198.0, "completions/max_terminated_length": 198.0, "completions/mean_length": 143.0, "completions/mean_terminated_length": 143.0, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "epoch": 0.0028641137159223506, "frac_reward_zero_std": 1.0, "grad_norm": 0.1279296875, "kl": 0.037364296382293105, "learning_rate": 7.9896e-06, "loss": 0.0015, "num_tokens": 1220649.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 27 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 204.0, "completions/max_terminated_length": 204.0, "completions/mean_length": 160.40625, "completions/mean_terminated_length": 160.40625, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "epoch": 0.0029701920016972526, "frac_reward_zero_std": 0.0, "grad_norm": 2.71875, "kl": 0.04326166835380718, "learning_rate": 7.9892e-06, "loss": 0.0017, "num_tokens": 1274870.0, "reward": 3.8869001865386963, "reward_std": 0.35727038979530334, "rewards/reward_fn/mean": 3.8869001865386963, "rewards/reward_fn/std": 0.35727038979530334, "step": 28 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 615.0, "completions/max_terminated_length": 615.0, "completions/mean_length": 368.28125, "completions/mean_terminated_length": 368.28125, "completions/min_length": 202.0, "completions/min_terminated_length": 202.0, "epoch": 0.0030762702874721545, "frac_reward_zero_std": 0.0, "grad_norm": 1.1875, "kl": 0.02703940650098957, "learning_rate": 7.9888e-06, "loss": 0.0011, "num_tokens": 1327839.0, "reward": 2.6013264656066895, "reward_std": 0.3262932300567627, "rewards/reward_fn/mean": 2.6013264656066895, "rewards/reward_fn/std": 0.3262932002544403, "step": 29 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 569.0, "completions/max_terminated_length": 569.0, "completions/mean_length": 377.5, "completions/mean_terminated_length": 377.5, "completions/min_length": 230.0, "completions/min_terminated_length": 230.0, "epoch": 0.0031823485732470564, "frac_reward_zero_std": 0.0, "grad_norm": 1.25, "kl": 0.021802546572871506, "learning_rate": 7.9884e-06, "loss": 0.0009, "num_tokens": 1356015.0, "reward": 2.6662087440490723, "reward_std": 0.4428122639656067, "rewards/reward_fn/mean": 2.6662087440490723, "rewards/reward_fn/std": 0.4428122341632843, "step": 30 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 242.0, "completions/max_terminated_length": 242.0, "completions/mean_length": 198.46875, "completions/mean_terminated_length": 198.46875, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "epoch": 0.0032884268590219583, "frac_reward_zero_std": 1.0, "grad_norm": 0.0986328125, "kl": 0.02761831966927275, "learning_rate": 7.988e-06, "loss": 0.0011, "num_tokens": 1397790.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 31 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 231.0, "completions/max_terminated_length": 231.0, "completions/mean_length": 202.78125, "completions/mean_terminated_length": 202.78125, "completions/min_length": 178.0, "completions/min_terminated_length": 178.0, "epoch": 0.00339450514479686, "frac_reward_zero_std": 0.0, "grad_norm": 1.6640625, "kl": 0.040427666739560664, "learning_rate": 7.9876e-06, "loss": 0.0016, "num_tokens": 1441143.0, "reward": 2.7268662452697754, "reward_std": 0.034821733832359314, "rewards/reward_fn/mean": 2.7268662452697754, "rewards/reward_fn/std": 0.034821704030036926, "step": 32 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 231.0, "completions/max_terminated_length": 231.0, "completions/mean_length": 204.6875, "completions/mean_terminated_length": 204.6875, "completions/min_length": 174.0, "completions/min_terminated_length": 174.0, "epoch": 0.003500583430571762, "frac_reward_zero_std": 1.0, "grad_norm": 0.0703125, "kl": 0.018971540295751765, "learning_rate": 7.987199999999999e-06, "loss": 0.0008, "num_tokens": 1482093.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 33 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 731.0, "completions/max_terminated_length": 731.0, "completions/mean_length": 302.71875, "completions/mean_terminated_length": 302.71875, "completions/min_length": 218.0, "completions/min_terminated_length": 218.0, "epoch": 0.003606661716346664, "frac_reward_zero_std": 0.0, "grad_norm": 1.7265625, "kl": 0.030103662895271555, "learning_rate": 7.9868e-06, "loss": 0.0012, "num_tokens": 1529924.0, "reward": 3.224987268447876, "reward_std": 0.5399196147918701, "rewards/reward_fn/mean": 3.224987268447876, "rewards/reward_fn/std": 0.5399196147918701, "step": 34 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 410.0, "completions/max_terminated_length": 410.0, "completions/mean_length": 281.1875, "completions/mean_terminated_length": 281.1875, "completions/min_length": 182.0, "completions/min_terminated_length": 182.0, "epoch": 0.003712740002121566, "frac_reward_zero_std": 0.0, "grad_norm": 1.4140625, "kl": 0.03495570027735084, "learning_rate": 7.986399999999999e-06, "loss": 0.0014, "num_tokens": 1571626.0, "reward": 2.5815320014953613, "reward_std": 0.1874743103981018, "rewards/reward_fn/mean": 2.5815320014953613, "rewards/reward_fn/std": 0.187474325299263, "step": 35 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 293.0, "completions/max_terminated_length": 293.0, "completions/mean_length": 231.21875, "completions/mean_terminated_length": 231.21875, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "epoch": 0.0038188182878964674, "frac_reward_zero_std": 0.0, "grad_norm": 2.046875, "kl": 0.04162114462815225, "learning_rate": 7.986e-06, "loss": 0.0017, "num_tokens": 1614289.0, "reward": 2.7709145545959473, "reward_std": 0.020474720746278763, "rewards/reward_fn/mean": 2.7709145545959473, "rewards/reward_fn/std": 0.02047473005950451, "step": 36 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 184.75, "completions/mean_terminated_length": 184.75, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "epoch": 0.00392489657367137, "frac_reward_zero_std": 1.0, "grad_norm": 0.1796875, "kl": 0.048726535169407725, "learning_rate": 7.9856e-06, "loss": 0.0019, "num_tokens": 1676041.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 37 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 197.0, "completions/max_terminated_length": 197.0, "completions/mean_length": 194.75, "completions/mean_terminated_length": 194.75, "completions/min_length": 192.0, "completions/min_terminated_length": 192.0, "epoch": 0.004030974859446272, "frac_reward_zero_std": 1.0, "grad_norm": 0.08447265625, "kl": 0.015456413209903985, "learning_rate": 7.9852e-06, "loss": 0.0006, "num_tokens": 1725761.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 38 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 674.0, "completions/max_terminated_length": 674.0, "completions/mean_length": 331.75, "completions/mean_terminated_length": 331.75, "completions/min_length": 183.0, "completions/min_terminated_length": 183.0, "epoch": 0.0041370531452211735, "frac_reward_zero_std": 0.0, "grad_norm": 1.5859375, "kl": 0.03350905637489632, "learning_rate": 7.9848e-06, "loss": 0.0013, "num_tokens": 1775257.0, "reward": 2.9837241172790527, "reward_std": 0.28284090757369995, "rewards/reward_fn/mean": 2.9837241172790527, "rewards/reward_fn/std": 0.28284087777137756, "step": 39 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 203.0, "completions/max_terminated_length": 203.0, "completions/mean_length": 178.8125, "completions/mean_terminated_length": 178.8125, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "epoch": 0.0042431314309960754, "frac_reward_zero_std": 1.0, "grad_norm": 0.09228515625, "kl": 0.026478524028789252, "learning_rate": 7.9844e-06, "loss": 0.0011, "num_tokens": 1817203.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 40 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 252.0, "completions/max_terminated_length": 252.0, "completions/mean_length": 196.46875, "completions/mean_terminated_length": 196.46875, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "epoch": 0.004349209716770977, "frac_reward_zero_std": 1.0, "grad_norm": 0.123046875, "kl": 0.030847460555378348, "learning_rate": 7.984e-06, "loss": 0.0012, "num_tokens": 1857922.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 41 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1439.0, "completions/max_terminated_length": 1439.0, "completions/mean_length": 671.46875, "completions/mean_terminated_length": 671.46875, "completions/min_length": 449.0, "completions/min_terminated_length": 449.0, "epoch": 0.004455288002545879, "frac_reward_zero_std": 0.0, "grad_norm": 0.91015625, "kl": 0.012836619760491885, "learning_rate": 7.9836e-06, "loss": 0.0005, "num_tokens": 1915249.0, "reward": 1.707290530204773, "reward_std": 0.014385012909770012, "rewards/reward_fn/mean": 1.707290530204773, "rewards/reward_fn/std": 0.01438502874225378, "step": 42 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 262.0, "completions/max_terminated_length": 262.0, "completions/mean_length": 239.125, "completions/mean_terminated_length": 239.125, "completions/min_length": 226.0, "completions/min_terminated_length": 226.0, "epoch": 0.00456136628832078, "frac_reward_zero_std": 1.0, "grad_norm": 0.07958984375, "kl": 0.018241140787722543, "learning_rate": 7.9832e-06, "loss": 0.0007, "num_tokens": 1965141.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 43 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 221.0, "completions/max_terminated_length": 221.0, "completions/mean_length": 195.03125, "completions/mean_terminated_length": 195.03125, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "epoch": 0.004667444574095682, "frac_reward_zero_std": 0.0, "grad_norm": 2.0, "kl": 0.027864656323799863, "learning_rate": 7.9828e-06, "loss": 0.0011, "num_tokens": 2010294.0, "reward": 2.832035541534424, "reward_std": 0.018766071647405624, "rewards/reward_fn/mean": 2.832035541534424, "rewards/reward_fn/std": 0.018766086548566818, "step": 44 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 154.0, "completions/max_terminated_length": 154.0, "completions/mean_length": 114.28125, "completions/mean_terminated_length": 114.28125, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "epoch": 0.004773522859870584, "frac_reward_zero_std": 1.0, "grad_norm": 0.177734375, "kl": 0.03448771417606622, "learning_rate": 7.9824e-06, "loss": 0.0014, "num_tokens": 2032671.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 45 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 82.0, "completions/max_terminated_length": 82.0, "completions/mean_length": 73.875, "completions/mean_terminated_length": 73.875, "completions/min_length": 70.0, "completions/min_terminated_length": 70.0, "epoch": 0.004879601145645486, "frac_reward_zero_std": 1.0, "grad_norm": 0.0859375, "kl": 0.02053578832419589, "learning_rate": 7.981999999999999e-06, "loss": 0.0008, "num_tokens": 2077403.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 46 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 638.0, "completions/max_terminated_length": 638.0, "completions/mean_length": 381.21875, "completions/mean_terminated_length": 381.21875, "completions/min_length": 261.0, "completions/min_terminated_length": 261.0, "epoch": 0.004985679431420388, "frac_reward_zero_std": 0.0, "grad_norm": 1.0390625, "kl": 0.028637878014706075, "learning_rate": 7.9816e-06, "loss": 0.0011, "num_tokens": 2126082.0, "reward": 2.7234487533569336, "reward_std": 0.29123303294181824, "rewards/reward_fn/mean": 2.7234487533569336, "rewards/reward_fn/std": 0.29123303294181824, "step": 47 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 454.0, "completions/max_terminated_length": 454.0, "completions/mean_length": 305.09375, "completions/mean_terminated_length": 305.09375, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "epoch": 0.00509175771719529, "frac_reward_zero_std": 0.0, "grad_norm": 1.0, "kl": 0.018045106873614714, "learning_rate": 7.9812e-06, "loss": 0.0007, "num_tokens": 2184005.0, "reward": 3.9637742042541504, "reward_std": 0.20492403209209442, "rewards/reward_fn/mean": 3.9637742042541504, "rewards/reward_fn/std": 0.20492400228977203, "step": 48 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 950.0, "completions/max_terminated_length": 950.0, "completions/mean_length": 523.65625, "completions/mean_terminated_length": 523.65625, "completions/min_length": 346.0, "completions/min_terminated_length": 346.0, "epoch": 0.005197836002970192, "frac_reward_zero_std": 0.0, "grad_norm": 1.1640625, "kl": 0.0190808747720439, "learning_rate": 7.9808e-06, "loss": 0.0008, "num_tokens": 2236858.0, "reward": 2.6711881160736084, "reward_std": 0.7798165082931519, "rewards/reward_fn/mean": 2.6711881160736084, "rewards/reward_fn/std": 0.7798165678977966, "step": 49 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 267.0, "completions/max_terminated_length": 267.0, "completions/mean_length": 197.03125, "completions/mean_terminated_length": 197.03125, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "epoch": 0.005303914288745094, "frac_reward_zero_std": 1.0, "grad_norm": 0.11669921875, "kl": 0.030177024600561708, "learning_rate": 7.9804e-06, "loss": 0.0012, "num_tokens": 2276123.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 50 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 425.0, "completions/max_terminated_length": 425.0, "completions/mean_length": 302.34375, "completions/mean_terminated_length": 302.34375, "completions/min_length": 195.0, "completions/min_terminated_length": 195.0, "epoch": 0.0054099925745199956, "frac_reward_zero_std": 0.0, "grad_norm": 1.8515625, "kl": 0.025053055898752064, "learning_rate": 7.98e-06, "loss": 0.001, "num_tokens": 2327142.0, "reward": 2.815532922744751, "reward_std": 0.061222709715366364, "rewards/reward_fn/mean": 2.815532922744751, "rewards/reward_fn/std": 0.06122272461652756, "step": 51 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 298.0, "completions/max_terminated_length": 298.0, "completions/mean_length": 207.6875, "completions/mean_terminated_length": 207.6875, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "epoch": 0.0055160708602948975, "frac_reward_zero_std": 0.0, "grad_norm": 2.609375, "kl": 0.043745150382164866, "learning_rate": 7.979599999999999e-06, "loss": 0.0017, "num_tokens": 2362652.0, "reward": 3.8098015785217285, "reward_std": 0.449843168258667, "rewards/reward_fn/mean": 3.8098015785217285, "rewards/reward_fn/std": 0.449843168258667, "step": 52 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 208.0, "completions/max_terminated_length": 208.0, "completions/mean_length": 177.59375, "completions/mean_terminated_length": 177.59375, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "epoch": 0.005622149146069799, "frac_reward_zero_std": 1.0, "grad_norm": 0.10009765625, "kl": 0.030419682036153972, "learning_rate": 7.9792e-06, "loss": 0.0012, "num_tokens": 2413711.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 53 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 606.0, "completions/max_terminated_length": 606.0, "completions/mean_length": 379.125, "completions/mean_terminated_length": 379.125, "completions/min_length": 298.0, "completions/min_terminated_length": 298.0, "epoch": 0.005728227431844701, "frac_reward_zero_std": 0.0, "grad_norm": 1.2578125, "kl": 0.03096459503285587, "learning_rate": 7.978799999999999e-06, "loss": 0.0012, "num_tokens": 2461139.0, "reward": 2.792288303375244, "reward_std": 0.22959093749523163, "rewards/reward_fn/mean": 2.792288303375244, "rewards/reward_fn/std": 0.22959090769290924, "step": 54 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1568.0, "completions/mean_length": 899.3125, "completions/mean_terminated_length": 862.258056640625, "completions/min_length": 576.0, "completions/min_terminated_length": 576.0, "epoch": 0.005834305717619603, "frac_reward_zero_std": 0.0, "grad_norm": 0.8046875, "kl": 0.013051759175141342, "learning_rate": 7.9784e-06, "loss": 0.0005, "num_tokens": 2527389.0, "reward": 2.574242115020752, "reward_std": 0.390576034784317, "rewards/reward_fn/mean": 2.574242115020752, "rewards/reward_fn/std": 0.39057594537734985, "step": 55 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 522.0, "completions/max_terminated_length": 522.0, "completions/mean_length": 341.96875, "completions/mean_terminated_length": 341.96875, "completions/min_length": 259.0, "completions/min_terminated_length": 259.0, "epoch": 0.005940384003394505, "frac_reward_zero_std": 1.0, "grad_norm": 0.08740234375, "kl": 0.01978624120238237, "learning_rate": 7.977999999999999e-06, "loss": 0.0008, "num_tokens": 2569180.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 56 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 200.0, "completions/max_terminated_length": 200.0, "completions/mean_length": 183.34375, "completions/mean_terminated_length": 183.34375, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "epoch": 0.006046462289169407, "frac_reward_zero_std": 1.0, "grad_norm": 0.123046875, "kl": 0.03518621769035235, "learning_rate": 7.9776e-06, "loss": 0.0014, "num_tokens": 2628679.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 57 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 242.0, "completions/max_terminated_length": 242.0, "completions/mean_length": 178.15625, "completions/mean_terminated_length": 178.15625, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "epoch": 0.006152540574944309, "frac_reward_zero_std": 0.0, "grad_norm": 2.21875, "kl": 0.010592784703476354, "learning_rate": 7.977199999999999e-06, "loss": 0.0004, "num_tokens": 2680684.0, "reward": 3.1400744915008545, "reward_std": 0.01094813086092472, "rewards/reward_fn/mean": 3.1400744915008545, "rewards/reward_fn/std": 0.010948143899440765, "step": 58 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 203.0, "completions/max_terminated_length": 203.0, "completions/mean_length": 170.875, "completions/mean_terminated_length": 170.875, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "epoch": 0.006258618860719211, "frac_reward_zero_std": 0.0, "grad_norm": 2.046875, "kl": 0.0406050537712872, "learning_rate": 7.9768e-06, "loss": 0.0016, "num_tokens": 2721064.0, "reward": 2.754612922668457, "reward_std": 0.024560745805501938, "rewards/reward_fn/mean": 2.754612922668457, "rewards/reward_fn/std": 0.02456069365143776, "step": 59 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 332.0, "completions/max_terminated_length": 332.0, "completions/mean_length": 260.6875, "completions/mean_terminated_length": 260.6875, "completions/min_length": 213.0, "completions/min_terminated_length": 213.0, "epoch": 0.006364697146494113, "frac_reward_zero_std": 0.0, "grad_norm": 1.640625, "kl": 0.03231381287332624, "learning_rate": 7.9764e-06, "loss": 0.0013, "num_tokens": 2783166.0, "reward": 2.8348751068115234, "reward_std": 0.021478639915585518, "rewards/reward_fn/mean": 2.8348751068115234, "rewards/reward_fn/std": 0.0214786846190691, "step": 60 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 307.0, "completions/max_terminated_length": 307.0, "completions/mean_length": 177.40625, "completions/mean_terminated_length": 177.40625, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "epoch": 0.006470775432269015, "frac_reward_zero_std": 1.0, "grad_norm": 0.1494140625, "kl": 0.03543406492099166, "learning_rate": 7.976e-06, "loss": 0.0014, "num_tokens": 2835147.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 61 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 443.0, "completions/max_terminated_length": 443.0, "completions/mean_length": 298.71875, "completions/mean_terminated_length": 298.71875, "completions/min_length": 210.0, "completions/min_terminated_length": 210.0, "epoch": 0.0065768537180439166, "frac_reward_zero_std": 0.0, "grad_norm": 1.1484375, "kl": 0.02661313521093689, "learning_rate": 7.9756e-06, "loss": 0.0011, "num_tokens": 2874050.0, "reward": 2.8492093086242676, "reward_std": 0.18044866621494293, "rewards/reward_fn/mean": 2.8492093086242676, "rewards/reward_fn/std": 0.18044860661029816, "step": 62 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 400.0, "completions/max_terminated_length": 400.0, "completions/mean_length": 271.96875, "completions/mean_terminated_length": 271.96875, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "epoch": 0.0066829320038188185, "frac_reward_zero_std": 0.0, "grad_norm": 1.4140625, "kl": 0.02399728394811973, "learning_rate": 7.9752e-06, "loss": 0.001, "num_tokens": 2914209.0, "reward": 3.966371774673462, "reward_std": 0.19022996723651886, "rewards/reward_fn/mean": 3.966371774673462, "rewards/reward_fn/std": 0.19022996723651886, "step": 63 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 198.0, "completions/max_terminated_length": 198.0, "completions/mean_length": 160.90625, "completions/mean_terminated_length": 160.90625, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "epoch": 0.00678901028959372, "frac_reward_zero_std": 1.0, "grad_norm": 0.10791015625, "kl": 0.021722051285905764, "learning_rate": 7.9748e-06, "loss": 0.0009, "num_tokens": 2953118.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 64 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 421.0, "completions/max_terminated_length": 421.0, "completions/mean_length": 327.25, "completions/mean_terminated_length": 327.25, "completions/min_length": 252.0, "completions/min_terminated_length": 252.0, "epoch": 0.006895088575368622, "frac_reward_zero_std": 0.0, "grad_norm": 1.5, "kl": 0.026302733516786247, "learning_rate": 7.9744e-06, "loss": 0.0011, "num_tokens": 3001318.0, "reward": 3.931765079498291, "reward_std": 0.26856717467308044, "rewards/reward_fn/mean": 3.931765079498291, "rewards/reward_fn/std": 0.26856720447540283, "step": 65 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 710.0, "completions/max_terminated_length": 710.0, "completions/mean_length": 387.625, "completions/mean_terminated_length": 387.625, "completions/min_length": 212.0, "completions/min_terminated_length": 212.0, "epoch": 0.007001166861143524, "frac_reward_zero_std": 0.0, "grad_norm": 1.65625, "kl": 0.026840948354220018, "learning_rate": 7.974e-06, "loss": 0.0011, "num_tokens": 3080314.0, "reward": 3.5707976818084717, "reward_std": 0.9079517722129822, "rewards/reward_fn/mean": 3.5707976818084717, "rewards/reward_fn/std": 0.9079517126083374, "step": 66 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 420.0, "completions/max_terminated_length": 420.0, "completions/mean_length": 292.9375, "completions/mean_terminated_length": 292.9375, "completions/min_length": 248.0, "completions/min_terminated_length": 248.0, "epoch": 0.007107245146918426, "frac_reward_zero_std": 0.0, "grad_norm": 1.25, "kl": 0.026440918241860345, "learning_rate": 7.9736e-06, "loss": 0.0011, "num_tokens": 3126872.0, "reward": 3.963066339492798, "reward_std": 0.2089284509420395, "rewards/reward_fn/mean": 3.963066339492798, "rewards/reward_fn/std": 0.20892846584320068, "step": 67 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 404.0, "completions/max_terminated_length": 404.0, "completions/mean_length": 238.3125, "completions/mean_terminated_length": 238.3125, "completions/min_length": 187.0, "completions/min_terminated_length": 187.0, "epoch": 0.007213323432693328, "frac_reward_zero_std": 0.0, "grad_norm": 1.84375, "kl": 0.04070335888536647, "learning_rate": 7.9732e-06, "loss": 0.0016, "num_tokens": 3171234.0, "reward": 2.782970905303955, "reward_std": 0.2698141932487488, "rewards/reward_fn/mean": 2.782970905303955, "rewards/reward_fn/std": 0.2698141932487488, "step": 68 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 185.0, "completions/max_terminated_length": 185.0, "completions/mean_length": 145.9375, "completions/mean_terminated_length": 145.9375, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "epoch": 0.00731940171846823, "frac_reward_zero_std": 1.0, "grad_norm": 0.10595703125, "kl": 0.039589619380421937, "learning_rate": 7.9728e-06, "loss": 0.0016, "num_tokens": 3219968.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 69 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 210.0, "completions/max_terminated_length": 210.0, "completions/mean_length": 159.4375, "completions/mean_terminated_length": 159.4375, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "epoch": 0.007425480004243132, "frac_reward_zero_std": 1.0, "grad_norm": 0.130859375, "kl": 0.044402167259249836, "learning_rate": 7.9724e-06, "loss": 0.0018, "num_tokens": 3261806.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 70 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 682.0, "completions/max_terminated_length": 682.0, "completions/mean_length": 536.28125, "completions/mean_terminated_length": 536.28125, "completions/min_length": 412.0, "completions/min_terminated_length": 412.0, "epoch": 0.007531558290018034, "frac_reward_zero_std": 1.0, "grad_norm": 0.0537109375, "kl": 0.017754276690538973, "learning_rate": 7.972e-06, "loss": 0.0007, "num_tokens": 3326647.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 71 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 652.0, "completions/max_terminated_length": 652.0, "completions/mean_length": 395.625, "completions/mean_terminated_length": 395.625, "completions/min_length": 285.0, "completions/min_terminated_length": 285.0, "epoch": 0.007637636575792935, "frac_reward_zero_std": 0.0, "grad_norm": 1.3984375, "kl": 0.02606621669838205, "learning_rate": 7.9716e-06, "loss": 0.001, "num_tokens": 3371147.0, "reward": 2.7468667030334473, "reward_std": 0.029990501701831818, "rewards/reward_fn/mean": 2.7468667030334473, "rewards/reward_fn/std": 0.029990488663315773, "step": 72 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 155.0, "completions/max_terminated_length": 155.0, "completions/mean_length": 129.21875, "completions/mean_terminated_length": 129.21875, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "epoch": 0.007743714861567837, "frac_reward_zero_std": 1.0, "grad_norm": 0.16796875, "kl": 0.035898417234420776, "learning_rate": 7.9712e-06, "loss": 0.0014, "num_tokens": 3416178.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 73 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 230.0, "completions/max_terminated_length": 230.0, "completions/mean_length": 210.5, "completions/mean_terminated_length": 210.5, "completions/min_length": 190.0, "completions/min_terminated_length": 190.0, "epoch": 0.00784979314734274, "frac_reward_zero_std": 1.0, "grad_norm": 0.09521484375, "kl": 0.0296817320631817, "learning_rate": 7.9708e-06, "loss": 0.0012, "num_tokens": 3470530.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 74 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 356.0, "completions/max_terminated_length": 356.0, "completions/mean_length": 286.75, "completions/mean_terminated_length": 286.75, "completions/min_length": 254.0, "completions/min_terminated_length": 254.0, "epoch": 0.00795587143311764, "frac_reward_zero_std": 1.0, "grad_norm": 0.06689453125, "kl": 0.02990246523404494, "learning_rate": 7.970399999999999e-06, "loss": 0.0012, "num_tokens": 3535866.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 75 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 70.0, "completions/max_terminated_length": 70.0, "completions/mean_length": 66.375, "completions/mean_terminated_length": 66.375, "completions/min_length": 60.0, "completions/min_terminated_length": 60.0, "epoch": 0.008061949718892543, "frac_reward_zero_std": 1.0, "grad_norm": 0.3203125, "kl": 0.04533489595633, "learning_rate": 7.97e-06, "loss": 0.0018, "num_tokens": 3588262.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 76 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 656.0, "completions/max_terminated_length": 656.0, "completions/mean_length": 330.9375, "completions/mean_terminated_length": 330.9375, "completions/min_length": 244.0, "completions/min_terminated_length": 244.0, "epoch": 0.008168028004667444, "frac_reward_zero_std": 1.0, "grad_norm": 0.0693359375, "kl": 0.023781975061865523, "learning_rate": 7.969599999999999e-06, "loss": 0.001, "num_tokens": 3637028.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 77 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 113.0, "completions/max_terminated_length": 113.0, "completions/mean_length": 84.1875, "completions/mean_terminated_length": 84.1875, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "epoch": 0.008274106290442347, "frac_reward_zero_std": 1.0, "grad_norm": 0.1650390625, "kl": 0.04825737385544926, "learning_rate": 7.9692e-06, "loss": 0.0019, "num_tokens": 3682186.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 78 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 438.0, "completions/max_terminated_length": 438.0, "completions/mean_length": 352.6875, "completions/mean_terminated_length": 352.6875, "completions/min_length": 301.0, "completions/min_terminated_length": 301.0, "epoch": 0.008380184576217248, "frac_reward_zero_std": 1.0, "grad_norm": 0.061767578125, "kl": 0.01972204475896433, "learning_rate": 7.968799999999999e-06, "loss": 0.0008, "num_tokens": 3765056.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 79 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 359.0, "completions/max_terminated_length": 359.0, "completions/mean_length": 293.71875, "completions/mean_terminated_length": 293.71875, "completions/min_length": 218.0, "completions/min_terminated_length": 218.0, "epoch": 0.008486262861992151, "frac_reward_zero_std": 0.0, "grad_norm": 1.5390625, "kl": 0.01990896329516545, "learning_rate": 7.9684e-06, "loss": 0.0008, "num_tokens": 3821559.0, "reward": 2.8117334842681885, "reward_std": 0.027212122455239296, "rewards/reward_fn/mean": 2.8117334842681885, "rewards/reward_fn/std": 0.027212098240852356, "step": 80 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 198.0, "completions/max_terminated_length": 198.0, "completions/mean_length": 158.65625, "completions/mean_terminated_length": 158.65625, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "epoch": 0.008592341147767052, "frac_reward_zero_std": 1.0, "grad_norm": 0.09912109375, "kl": 0.03137456753756851, "learning_rate": 7.967999999999999e-06, "loss": 0.0013, "num_tokens": 3855596.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 81 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 225.0, "completions/max_terminated_length": 225.0, "completions/mean_length": 185.59375, "completions/mean_terminated_length": 185.59375, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "epoch": 0.008698419433541955, "frac_reward_zero_std": 1.0, "grad_norm": 0.134765625, "kl": 0.04182742937700823, "learning_rate": 7.9676e-06, "loss": 0.0017, "num_tokens": 3903775.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 82 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 365.0, "completions/max_terminated_length": 365.0, "completions/mean_length": 245.65625, "completions/mean_terminated_length": 245.65625, "completions/min_length": 218.0, "completions/min_terminated_length": 218.0, "epoch": 0.008804497719316856, "frac_reward_zero_std": 1.0, "grad_norm": 0.09716796875, "kl": 0.02975867863278836, "learning_rate": 7.967199999999999e-06, "loss": 0.0012, "num_tokens": 3946388.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 83 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 193.0, "completions/max_terminated_length": 193.0, "completions/mean_length": 163.53125, "completions/mean_terminated_length": 163.53125, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "epoch": 0.008910576005091759, "frac_reward_zero_std": 1.0, "grad_norm": 0.1494140625, "kl": 0.0318527152412571, "learning_rate": 7.9668e-06, "loss": 0.0013, "num_tokens": 3979045.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 84 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 445.0, "completions/max_terminated_length": 445.0, "completions/mean_length": 284.46875, "completions/mean_terminated_length": 284.46875, "completions/min_length": 207.0, "completions/min_terminated_length": 207.0, "epoch": 0.00901665429086666, "frac_reward_zero_std": 1.0, "grad_norm": 0.07373046875, "kl": 0.025949590606614947, "learning_rate": 7.9664e-06, "loss": 0.001, "num_tokens": 4022452.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 85 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 196.0, "completions/max_terminated_length": 196.0, "completions/mean_length": 154.96875, "completions/mean_terminated_length": 154.96875, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "epoch": 0.00912273257664156, "frac_reward_zero_std": 1.0, "grad_norm": 0.1513671875, "kl": 0.044943351356778294, "learning_rate": 7.966e-06, "loss": 0.0018, "num_tokens": 4055379.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 86 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 237.0, "completions/max_terminated_length": 237.0, "completions/mean_length": 173.25, "completions/mean_terminated_length": 173.25, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "epoch": 0.009228810862416463, "frac_reward_zero_std": 1.0, "grad_norm": 0.11083984375, "kl": 0.03375592204974964, "learning_rate": 7.9656e-06, "loss": 0.0014, "num_tokens": 4095131.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 87 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 323.0, "completions/max_terminated_length": 323.0, "completions/mean_length": 232.5, "completions/mean_terminated_length": 232.5, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "epoch": 0.009334889148191364, "frac_reward_zero_std": 0.0, "grad_norm": 1.6015625, "kl": 0.040917065110988915, "learning_rate": 7.9652e-06, "loss": 0.0016, "num_tokens": 4133579.0, "reward": 3.385199785232544, "reward_std": 0.6248396635055542, "rewards/reward_fn/mean": 3.385199785232544, "rewards/reward_fn/std": 0.6248396635055542, "step": 88 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 246.0, "completions/max_terminated_length": 246.0, "completions/mean_length": 195.125, "completions/mean_terminated_length": 195.125, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "epoch": 0.009440967433966267, "frac_reward_zero_std": 1.0, "grad_norm": 0.08837890625, "kl": 0.03094743611291051, "learning_rate": 7.9648e-06, "loss": 0.0012, "num_tokens": 4168399.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 89 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 341.0, "completions/max_terminated_length": 341.0, "completions/mean_length": 254.75, "completions/mean_terminated_length": 254.75, "completions/min_length": 197.0, "completions/min_terminated_length": 197.0, "epoch": 0.009547045719741168, "frac_reward_zero_std": 0.0, "grad_norm": 1.59375, "kl": 0.020157222083071247, "learning_rate": 7.9644e-06, "loss": 0.0008, "num_tokens": 4212263.0, "reward": 2.7611052989959717, "reward_std": 0.04510229453444481, "rewards/reward_fn/mean": 2.7611052989959717, "rewards/reward_fn/std": 0.0451023168861866, "step": 90 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 369.0, "completions/max_terminated_length": 369.0, "completions/mean_length": 228.71875, "completions/mean_terminated_length": 228.71875, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "epoch": 0.009653124005516071, "frac_reward_zero_std": 0.0, "grad_norm": 1.9609375, "kl": 0.025626638293033466, "learning_rate": 7.964e-06, "loss": 0.001, "num_tokens": 4264894.0, "reward": 3.182257652282715, "reward_std": 0.5200350284576416, "rewards/reward_fn/mean": 3.182257652282715, "rewards/reward_fn/std": 0.5200349688529968, "step": 91 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 601.0, "completions/max_terminated_length": 601.0, "completions/mean_length": 388.15625, "completions/mean_terminated_length": 388.15625, "completions/min_length": 299.0, "completions/min_terminated_length": 299.0, "epoch": 0.009759202291290972, "frac_reward_zero_std": 0.0, "grad_norm": 1.1171875, "kl": 0.02052137372083962, "learning_rate": 7.9636e-06, "loss": 0.0008, "num_tokens": 4326723.0, "reward": 3.77706241607666, "reward_std": 0.5531520247459412, "rewards/reward_fn/mean": 3.77706241607666, "rewards/reward_fn/std": 0.5531519651412964, "step": 92 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 209.0, "completions/max_terminated_length": 209.0, "completions/mean_length": 149.53125, "completions/mean_terminated_length": 149.53125, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "epoch": 0.009865280577065875, "frac_reward_zero_std": 1.0, "grad_norm": 0.1279296875, "kl": 0.025205849786289036, "learning_rate": 7.963199999999999e-06, "loss": 0.001, "num_tokens": 4361044.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 93 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1995.0, "completions/mean_length": 967.4375, "completions/mean_terminated_length": 813.0714721679688, "completions/min_length": 320.0, "completions/min_terminated_length": 320.0, "epoch": 0.009971358862840776, "frac_reward_zero_std": 0.0, "grad_norm": 1.0703125, "kl": 0.01375104625185486, "learning_rate": 7.9628e-06, "loss": 0.0006, "num_tokens": 4422434.0, "reward": 2.3512930870056152, "reward_std": 0.9757513999938965, "rewards/reward_fn/mean": 2.3512930870056152, "rewards/reward_fn/std": 0.9757513999938965, "step": 94 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 398.0, "completions/max_terminated_length": 398.0, "completions/mean_length": 264.34375, "completions/mean_terminated_length": 264.34375, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "epoch": 0.010077437148615679, "frac_reward_zero_std": 1.0, "grad_norm": 0.087890625, "kl": 0.03294429712696001, "learning_rate": 7.962399999999999e-06, "loss": 0.0013, "num_tokens": 4460301.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 95 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1975.0, "completions/max_terminated_length": 1975.0, "completions/mean_length": 1142.59375, "completions/mean_terminated_length": 1142.59375, "completions/min_length": 641.0, "completions/min_terminated_length": 641.0, "epoch": 0.01018351543439058, "frac_reward_zero_std": 0.0, "grad_norm": 0.484375, "kl": 0.011026079504517838, "learning_rate": 7.962e-06, "loss": 0.0004, "num_tokens": 4529856.0, "reward": 2.711195468902588, "reward_std": 0.18870149552822113, "rewards/reward_fn/mean": 2.711195468902588, "rewards/reward_fn/std": 0.18870148062705994, "step": 96 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 302.0, "completions/max_terminated_length": 302.0, "completions/mean_length": 215.78125, "completions/mean_terminated_length": 215.78125, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "epoch": 0.010289593720165482, "frac_reward_zero_std": 1.0, "grad_norm": 0.080078125, "kl": 0.02338853490073234, "learning_rate": 7.9616e-06, "loss": 0.0009, "num_tokens": 4571801.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 97 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 706.0, "completions/max_terminated_length": 706.0, "completions/mean_length": 508.03125, "completions/mean_terminated_length": 508.03125, "completions/min_length": 375.0, "completions/min_terminated_length": 375.0, "epoch": 0.010395672005940384, "frac_reward_zero_std": 0.0, "grad_norm": 1.0078125, "kl": 0.012433522861101665, "learning_rate": 7.9612e-06, "loss": 0.0005, "num_tokens": 4632890.0, "reward": 3.551553726196289, "reward_std": 0.6977203488349915, "rewards/reward_fn/mean": 3.551553726196289, "rewards/reward_fn/std": 0.6977203488349915, "step": 98 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 881.0, "completions/max_terminated_length": 881.0, "completions/mean_length": 503.21875, "completions/mean_terminated_length": 503.21875, "completions/min_length": 272.0, "completions/min_terminated_length": 272.0, "epoch": 0.010501750291715286, "frac_reward_zero_std": 0.0, "grad_norm": 1.015625, "kl": 0.01728816461400129, "learning_rate": 7.9608e-06, "loss": 0.0007, "num_tokens": 4686113.0, "reward": 2.795738458633423, "reward_std": 0.05973564833402634, "rewards/reward_fn/mean": 2.795738458633423, "rewards/reward_fn/std": 0.05973568186163902, "step": 99 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 227.0, "completions/max_terminated_length": 227.0, "completions/mean_length": 179.78125, "completions/mean_terminated_length": 179.78125, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "epoch": 0.010607828577490187, "frac_reward_zero_std": 1.0, "grad_norm": 0.134765625, "kl": 0.037204470427241176, "learning_rate": 7.9604e-06, "loss": 0.0015, "num_tokens": 4720314.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 100 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 366.0, "completions/max_terminated_length": 366.0, "completions/mean_length": 248.46875, "completions/mean_terminated_length": 248.46875, "completions/min_length": 184.0, "completions/min_terminated_length": 184.0, "epoch": 0.01071390686326509, "frac_reward_zero_std": 1.0, "grad_norm": 0.107421875, "kl": 0.027893651509657502, "learning_rate": 7.96e-06, "loss": 0.0011, "num_tokens": 4770729.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 101 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 264.0, "completions/max_terminated_length": 264.0, "completions/mean_length": 218.3125, "completions/mean_terminated_length": 218.3125, "completions/min_length": 181.0, "completions/min_terminated_length": 181.0, "epoch": 0.010819985149039991, "frac_reward_zero_std": 1.0, "grad_norm": 0.09228515625, "kl": 0.027056844613980502, "learning_rate": 7.959599999999999e-06, "loss": 0.0011, "num_tokens": 4809267.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 102 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 93.0, "completions/max_terminated_length": 93.0, "completions/mean_length": 85.6875, "completions/mean_terminated_length": 85.6875, "completions/min_length": 79.0, "completions/min_terminated_length": 79.0, "epoch": 0.010926063434814894, "frac_reward_zero_std": 1.0, "grad_norm": 0.1962890625, "kl": 0.015027127868961543, "learning_rate": 7.9592e-06, "loss": 0.0006, "num_tokens": 4831881.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 103 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 286.0, "completions/max_terminated_length": 286.0, "completions/mean_length": 189.625, "completions/mean_terminated_length": 189.625, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "epoch": 0.011032141720589795, "frac_reward_zero_std": 0.0, "grad_norm": 1.6875, "kl": 0.02954344844329171, "learning_rate": 7.958799999999999e-06, "loss": 0.0012, "num_tokens": 4893149.0, "reward": 2.8656177520751953, "reward_std": 0.012954896315932274, "rewards/reward_fn/mean": 2.8656177520751953, "rewards/reward_fn/std": 0.012954906560480595, "step": 104 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 216.0, "completions/max_terminated_length": 216.0, "completions/mean_length": 143.34375, "completions/mean_terminated_length": 143.34375, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "epoch": 0.011138220006364698, "frac_reward_zero_std": 0.0, "grad_norm": 2.109375, "kl": 0.04420140647562221, "learning_rate": 7.9584e-06, "loss": 0.0018, "num_tokens": 4914216.0, "reward": 3.203092575073242, "reward_std": 0.031201422214508057, "rewards/reward_fn/mean": 3.203092575073242, "rewards/reward_fn/std": 0.031201381236314774, "step": 105 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 252.0, "completions/max_terminated_length": 252.0, "completions/mean_length": 191.0625, "completions/mean_terminated_length": 191.0625, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "epoch": 0.011244298292139599, "frac_reward_zero_std": 0.0, "grad_norm": 2.046875, "kl": 0.024419172026682645, "learning_rate": 7.957999999999999e-06, "loss": 0.001, "num_tokens": 4955690.0, "reward": 2.354905843734741, "reward_std": 0.586423397064209, "rewards/reward_fn/mean": 2.354905843734741, "rewards/reward_fn/std": 0.586423397064209, "step": 106 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 277.0, "completions/max_terminated_length": 277.0, "completions/mean_length": 202.71875, "completions/mean_terminated_length": 202.71875, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "epoch": 0.011350376577914502, "frac_reward_zero_std": 0.0, "grad_norm": 2.328125, "kl": 0.02303978052805178, "learning_rate": 7.9576e-06, "loss": 0.0009, "num_tokens": 4998529.0, "reward": 3.2449896335601807, "reward_std": 0.5573272705078125, "rewards/reward_fn/mean": 3.2449896335601807, "rewards/reward_fn/std": 0.5573272109031677, "step": 107 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 68.0, "completions/max_terminated_length": 68.0, "completions/mean_length": 62.75, "completions/mean_terminated_length": 62.75, "completions/min_length": 56.0, "completions/min_terminated_length": 56.0, "epoch": 0.011456454863689403, "frac_reward_zero_std": 1.0, "grad_norm": 0.2021484375, "kl": 0.02382526727160439, "learning_rate": 7.9572e-06, "loss": 0.001, "num_tokens": 5035353.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 108 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1877.0, "completions/max_terminated_length": 1877.0, "completions/mean_length": 1008.65625, "completions/mean_terminated_length": 1008.65625, "completions/min_length": 489.0, "completions/min_terminated_length": 489.0, "epoch": 0.011562533149464305, "frac_reward_zero_std": 0.0, "grad_norm": 0.921875, "kl": 0.011780590401031077, "learning_rate": 7.9568e-06, "loss": 0.0005, "num_tokens": 5094030.0, "reward": 2.959881067276001, "reward_std": 0.9156700968742371, "rewards/reward_fn/mean": 2.959881067276001, "rewards/reward_fn/std": 0.9156700968742371, "step": 109 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 317.0, "completions/max_terminated_length": 317.0, "completions/mean_length": 280.09375, "completions/mean_terminated_length": 280.09375, "completions/min_length": 256.0, "completions/min_terminated_length": 256.0, "epoch": 0.011668611435239206, "frac_reward_zero_std": 1.0, "grad_norm": 0.08349609375, "kl": 0.021801961964229122, "learning_rate": 7.9564e-06, "loss": 0.0009, "num_tokens": 5138897.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 110 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 272.0, "completions/max_terminated_length": 272.0, "completions/mean_length": 207.96875, "completions/mean_terminated_length": 207.96875, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "epoch": 0.01177468972101411, "frac_reward_zero_std": 0.0, "grad_norm": 1.9140625, "kl": 0.03097967169014737, "learning_rate": 7.956e-06, "loss": 0.0012, "num_tokens": 5176816.0, "reward": 3.800929069519043, "reward_std": 0.5765722393989563, "rewards/reward_fn/mean": 3.800929069519043, "rewards/reward_fn/std": 0.5765722393989563, "step": 111 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 290.0, "completions/max_terminated_length": 290.0, "completions/mean_length": 205.53125, "completions/mean_terminated_length": 205.53125, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "epoch": 0.01188076800678901, "frac_reward_zero_std": 0.0, "grad_norm": 1.828125, "kl": 0.043233627162408084, "learning_rate": 7.955599999999999e-06, "loss": 0.0017, "num_tokens": 5227073.0, "reward": 2.8226916790008545, "reward_std": 0.274354487657547, "rewards/reward_fn/mean": 2.8226916790008545, "rewards/reward_fn/std": 0.2743545174598694, "step": 112 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 400.0, "completions/max_terminated_length": 400.0, "completions/mean_length": 286.84375, "completions/mean_terminated_length": 286.84375, "completions/min_length": 241.0, "completions/min_terminated_length": 241.0, "epoch": 0.011986846292563913, "frac_reward_zero_std": 1.0, "grad_norm": 0.10205078125, "kl": 0.022725433052983135, "learning_rate": 7.9552e-06, "loss": 0.0009, "num_tokens": 5274268.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 113 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 123.0, "completions/max_terminated_length": 123.0, "completions/mean_length": 77.78125, "completions/mean_terminated_length": 77.78125, "completions/min_length": 69.0, "completions/min_terminated_length": 69.0, "epoch": 0.012092924578338814, "frac_reward_zero_std": 0.0, "grad_norm": 5.09375, "kl": 0.03250217955792323, "learning_rate": 7.954799999999999e-06, "loss": 0.0013, "num_tokens": 5313717.0, "reward": 3.0405046939849854, "reward_std": 0.030798695981502533, "rewards/reward_fn/mean": 3.0405046939849854, "rewards/reward_fn/std": 0.030798697844147682, "step": 114 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 529.0, "completions/max_terminated_length": 529.0, "completions/mean_length": 346.875, "completions/mean_terminated_length": 346.875, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "epoch": 0.012199002864113715, "frac_reward_zero_std": 0.0, "grad_norm": 1.375, "kl": 0.019344605010701343, "learning_rate": 7.9544e-06, "loss": 0.0008, "num_tokens": 5368593.0, "reward": 3.050971031188965, "reward_std": 0.5881122350692749, "rewards/reward_fn/mean": 3.050971031188965, "rewards/reward_fn/std": 0.5881122350692749, "step": 115 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 730.0, "completions/max_terminated_length": 730.0, "completions/mean_length": 416.59375, "completions/mean_terminated_length": 416.59375, "completions/min_length": 278.0, "completions/min_terminated_length": 278.0, "epoch": 0.012305081149888618, "frac_reward_zero_std": 0.0, "grad_norm": 1.3515625, "kl": 0.02247777715092525, "learning_rate": 7.953999999999999e-06, "loss": 0.0009, "num_tokens": 5419044.0, "reward": 2.8257861137390137, "reward_std": 0.029950875788927078, "rewards/reward_fn/mean": 2.8257861137390137, "rewards/reward_fn/std": 0.029950888827443123, "step": 116 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 553.0, "completions/max_terminated_length": 553.0, "completions/mean_length": 392.9375, "completions/mean_terminated_length": 392.9375, "completions/min_length": 268.0, "completions/min_terminated_length": 268.0, "epoch": 0.012411159435663519, "frac_reward_zero_std": 0.0, "grad_norm": 1.34375, "kl": 0.02738574449904263, "learning_rate": 7.9536e-06, "loss": 0.0011, "num_tokens": 5474146.0, "reward": 3.4753129482269287, "reward_std": 0.6120408773422241, "rewards/reward_fn/mean": 3.4753129482269287, "rewards/reward_fn/std": 0.6120408773422241, "step": 117 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 302.0, "completions/max_terminated_length": 302.0, "completions/mean_length": 239.21875, "completions/mean_terminated_length": 239.21875, "completions/min_length": 209.0, "completions/min_terminated_length": 209.0, "epoch": 0.012517237721438422, "frac_reward_zero_std": 0.0, "grad_norm": 1.4921875, "kl": 0.02117563263163902, "learning_rate": 7.953199999999999e-06, "loss": 0.0008, "num_tokens": 5515369.0, "reward": 2.897916316986084, "reward_std": 0.20252078771591187, "rewards/reward_fn/mean": 2.897916316986084, "rewards/reward_fn/std": 0.20252081751823425, "step": 118 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1576.0, "completions/max_terminated_length": 1576.0, "completions/mean_length": 434.59375, "completions/mean_terminated_length": 434.59375, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "epoch": 0.012623316007213323, "frac_reward_zero_std": 0.0, "grad_norm": 1.5234375, "kl": 0.029518495866796002, "learning_rate": 7.9528e-06, "loss": 0.0012, "num_tokens": 5561532.0, "reward": 2.9283552169799805, "reward_std": 0.04649563878774643, "rewards/reward_fn/mean": 2.9283552169799805, "rewards/reward_fn/std": 0.046495646238327026, "step": 119 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 216.0, "completions/max_terminated_length": 216.0, "completions/mean_length": 170.46875, "completions/mean_terminated_length": 170.46875, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "epoch": 0.012729394292988225, "frac_reward_zero_std": 1.0, "grad_norm": 0.0869140625, "kl": 0.031327863136539236, "learning_rate": 7.9524e-06, "loss": 0.0013, "num_tokens": 5597931.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 120 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 364.0, "completions/max_terminated_length": 364.0, "completions/mean_length": 213.5625, "completions/mean_terminated_length": 213.5625, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "epoch": 0.012835472578763127, "frac_reward_zero_std": 1.0, "grad_norm": 0.14453125, "kl": 0.05756760714575648, "learning_rate": 7.952e-06, "loss": 0.0023, "num_tokens": 5639261.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 121 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 275.0, "completions/max_terminated_length": 275.0, "completions/mean_length": 207.84375, "completions/mean_terminated_length": 207.84375, "completions/min_length": 184.0, "completions/min_terminated_length": 184.0, "epoch": 0.01294155086453803, "frac_reward_zero_std": 1.0, "grad_norm": 0.16796875, "kl": 0.05318293231539428, "learning_rate": 7.9516e-06, "loss": 0.0021, "num_tokens": 5662744.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 122 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 266.0, "completions/max_terminated_length": 266.0, "completions/mean_length": 190.1875, "completions/mean_terminated_length": 190.1875, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "epoch": 0.01304762915031293, "frac_reward_zero_std": 1.0, "grad_norm": 0.16015625, "kl": 0.05158931959886104, "learning_rate": 7.9512e-06, "loss": 0.0021, "num_tokens": 5704862.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 123 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 274.0, "completions/max_terminated_length": 274.0, "completions/mean_length": 184.1875, "completions/mean_terminated_length": 184.1875, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "epoch": 0.013153707436087833, "frac_reward_zero_std": 1.0, "grad_norm": 0.13671875, "kl": 0.048936213250271976, "learning_rate": 7.9508e-06, "loss": 0.002, "num_tokens": 5736100.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 124 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 795.0, "completions/max_terminated_length": 795.0, "completions/mean_length": 410.5625, "completions/mean_terminated_length": 410.5625, "completions/min_length": 246.0, "completions/min_terminated_length": 246.0, "epoch": 0.013259785721862734, "frac_reward_zero_std": 0.0, "grad_norm": 1.4921875, "kl": 0.03778961458010599, "learning_rate": 7.9504e-06, "loss": 0.0015, "num_tokens": 5796438.0, "reward": 3.253349542617798, "reward_std": 0.24341487884521484, "rewards/reward_fn/mean": 3.253349542617798, "rewards/reward_fn/std": 0.24341486394405365, "step": 125 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 289.0, "completions/max_terminated_length": 289.0, "completions/mean_length": 238.21875, "completions/mean_terminated_length": 238.21875, "completions/min_length": 171.0, "completions/min_terminated_length": 171.0, "epoch": 0.013365864007637637, "frac_reward_zero_std": 0.0, "grad_norm": 1.6796875, "kl": 0.03385805507423356, "learning_rate": 7.95e-06, "loss": 0.0014, "num_tokens": 5864573.0, "reward": 2.7737503051757812, "reward_std": 0.32643312215805054, "rewards/reward_fn/mean": 2.7737503051757812, "rewards/reward_fn/std": 0.32643312215805054, "step": 126 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 118.0, "completions/max_terminated_length": 118.0, "completions/mean_length": 96.03125, "completions/mean_terminated_length": 96.03125, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "epoch": 0.013471942293412538, "frac_reward_zero_std": 1.0, "grad_norm": 0.1962890625, "kl": 0.029550763370934874, "learning_rate": 7.9496e-06, "loss": 0.0012, "num_tokens": 5897438.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 127 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 268.0, "completions/max_terminated_length": 268.0, "completions/mean_length": 169.96875, "completions/mean_terminated_length": 169.96875, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "epoch": 0.01357802057918744, "frac_reward_zero_std": 0.0, "grad_norm": 1.90625, "kl": 0.03915034799138084, "learning_rate": 7.9492e-06, "loss": 0.0016, "num_tokens": 5940637.0, "reward": 3.2658727169036865, "reward_std": 0.14581863582134247, "rewards/reward_fn/mean": 3.2658727169036865, "rewards/reward_fn/std": 0.14581862092018127, "step": 128 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 163.0, "completions/max_terminated_length": 163.0, "completions/mean_length": 145.78125, "completions/mean_terminated_length": 145.78125, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "epoch": 0.013684098864962342, "frac_reward_zero_std": 1.0, "grad_norm": 0.0908203125, "kl": 0.0380926345824264, "learning_rate": 7.9488e-06, "loss": 0.0015, "num_tokens": 5976566.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 129 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 144.0, "completions/max_terminated_length": 144.0, "completions/mean_length": 125.0, "completions/mean_terminated_length": 125.0, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "epoch": 0.013790177150737245, "frac_reward_zero_std": 1.0, "grad_norm": 0.11328125, "kl": 0.0405671053158585, "learning_rate": 7.9484e-06, "loss": 0.0016, "num_tokens": 6020598.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 130 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 128.0, "completions/max_terminated_length": 128.0, "completions/mean_length": 80.375, "completions/mean_terminated_length": 80.375, "completions/min_length": 73.0, "completions/min_terminated_length": 73.0, "epoch": 0.013896255436512146, "frac_reward_zero_std": 1.0, "grad_norm": 0.1552734375, "kl": 0.03241157752927393, "learning_rate": 7.948e-06, "loss": 0.0013, "num_tokens": 6063330.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 131 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 648.0, "completions/max_terminated_length": 648.0, "completions/mean_length": 430.46875, "completions/mean_terminated_length": 430.46875, "completions/min_length": 290.0, "completions/min_terminated_length": 290.0, "epoch": 0.014002333722287048, "frac_reward_zero_std": 0.0, "grad_norm": 1.1640625, "kl": 0.024634861125377938, "learning_rate": 7.9476e-06, "loss": 0.001, "num_tokens": 6113649.0, "reward": 3.187520980834961, "reward_std": 0.4377913773059845, "rewards/reward_fn/mean": 3.187520980834961, "rewards/reward_fn/std": 0.4377914071083069, "step": 132 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 382.0, "completions/max_terminated_length": 382.0, "completions/mean_length": 260.34375, "completions/mean_terminated_length": 260.34375, "completions/min_length": 177.0, "completions/min_terminated_length": 177.0, "epoch": 0.01410841200806195, "frac_reward_zero_std": 0.0, "grad_norm": 2.1875, "kl": 0.04265774926170707, "learning_rate": 7.947199999999999e-06, "loss": 0.0017, "num_tokens": 6137468.0, "reward": 3.061521530151367, "reward_std": 0.5167216062545776, "rewards/reward_fn/mean": 3.061521530151367, "rewards/reward_fn/std": 0.5167215466499329, "step": 133 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 226.0, "completions/max_terminated_length": 226.0, "completions/mean_length": 107.59375, "completions/mean_terminated_length": 107.59375, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "epoch": 0.014214490293836852, "frac_reward_zero_std": 1.0, "grad_norm": 0.14453125, "kl": 0.022619035269599408, "learning_rate": 7.9468e-06, "loss": 0.0009, "num_tokens": 6175791.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 134 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 230.0, "completions/max_terminated_length": 230.0, "completions/mean_length": 195.75, "completions/mean_terminated_length": 195.75, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "epoch": 0.014320568579611753, "frac_reward_zero_std": 0.0, "grad_norm": 1.3359375, "kl": 0.026089665334438905, "learning_rate": 7.946399999999999e-06, "loss": 0.001, "num_tokens": 6213511.0, "reward": 2.8176023960113525, "reward_std": 0.3619799017906189, "rewards/reward_fn/mean": 2.8176023960113525, "rewards/reward_fn/std": 0.3619799017906189, "step": 135 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 229.0, "completions/max_terminated_length": 229.0, "completions/mean_length": 180.71875, "completions/mean_terminated_length": 180.71875, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "epoch": 0.014426646865386656, "frac_reward_zero_std": 1.0, "grad_norm": 0.095703125, "kl": 0.040278222353663296, "learning_rate": 7.946e-06, "loss": 0.0016, "num_tokens": 6256158.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 136 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1010.0, "completions/max_terminated_length": 1010.0, "completions/mean_length": 459.96875, "completions/mean_terminated_length": 459.96875, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "epoch": 0.014532725151161557, "frac_reward_zero_std": 1.0, "grad_norm": 0.06298828125, "kl": 0.024632773885969073, "learning_rate": 7.945599999999999e-06, "loss": 0.001, "num_tokens": 6306045.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 137 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1977.0, "completions/mean_length": 723.0, "completions/mean_terminated_length": 680.258056640625, "completions/min_length": 352.0, "completions/min_terminated_length": 352.0, "epoch": 0.01463880343693646, "frac_reward_zero_std": 0.0, "grad_norm": 1.015625, "kl": 0.027096317993709818, "learning_rate": 7.9452e-06, "loss": 0.0011, "num_tokens": 6360445.0, "reward": 2.2733407020568848, "reward_std": 0.8242188096046448, "rewards/reward_fn/mean": 2.2733407020568848, "rewards/reward_fn/std": 0.8242188096046448, "step": 138 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1742.0, "completions/max_terminated_length": 1742.0, "completions/mean_length": 603.75, "completions/mean_terminated_length": 603.75, "completions/min_length": 277.0, "completions/min_terminated_length": 277.0, "epoch": 0.01474488172271136, "frac_reward_zero_std": 0.0, "grad_norm": 1.0078125, "kl": 0.02166763847344555, "learning_rate": 7.944799999999999e-06, "loss": 0.0009, "num_tokens": 6415573.0, "reward": 2.891838312149048, "reward_std": 0.06433243304491043, "rewards/reward_fn/mean": 2.891838312149048, "rewards/reward_fn/std": 0.06433244049549103, "step": 139 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 492.0, "completions/max_terminated_length": 492.0, "completions/mean_length": 187.6875, "completions/mean_terminated_length": 187.6875, "completions/min_length": 78.0, "completions/min_terminated_length": 78.0, "epoch": 0.014850960008486264, "frac_reward_zero_std": 1.0, "grad_norm": 0.134765625, "kl": 0.027505328936968, "learning_rate": 7.9444e-06, "loss": 0.0011, "num_tokens": 6463531.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 140 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 266.0, "completions/max_terminated_length": 266.0, "completions/mean_length": 128.375, "completions/mean_terminated_length": 128.375, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "epoch": 0.014957038294261165, "frac_reward_zero_std": 0.0, "grad_norm": 1.546875, "kl": 0.026608427229803056, "learning_rate": 7.943999999999999e-06, "loss": 0.0011, "num_tokens": 6516215.0, "reward": 3.2211999893188477, "reward_std": 0.14292843639850616, "rewards/reward_fn/mean": 3.2211999893188477, "rewards/reward_fn/std": 0.14292845129966736, "step": 141 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 411.0, "completions/max_terminated_length": 411.0, "completions/mean_length": 280.0, "completions/mean_terminated_length": 280.0, "completions/min_length": 209.0, "completions/min_terminated_length": 209.0, "epoch": 0.015063116580036067, "frac_reward_zero_std": 0.0, "grad_norm": 1.4296875, "kl": 0.047853924334049225, "learning_rate": 7.9436e-06, "loss": 0.0019, "num_tokens": 6559991.0, "reward": 2.7001118659973145, "reward_std": 0.026461286470294, "rewards/reward_fn/mean": 2.7001118659973145, "rewards/reward_fn/std": 0.026461288332939148, "step": 142 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 773.0, "completions/max_terminated_length": 773.0, "completions/mean_length": 365.40625, "completions/mean_terminated_length": 365.40625, "completions/min_length": 170.0, "completions/min_terminated_length": 170.0, "epoch": 0.015169194865810968, "frac_reward_zero_std": 0.0, "grad_norm": 1.6015625, "kl": 0.038805975636933, "learning_rate": 7.9432e-06, "loss": 0.0016, "num_tokens": 6606052.0, "reward": 2.986452579498291, "reward_std": 0.7081362009048462, "rewards/reward_fn/mean": 2.986452579498291, "rewards/reward_fn/std": 0.7081362009048462, "step": 143 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 193.0, "completions/max_terminated_length": 193.0, "completions/mean_length": 162.875, "completions/mean_terminated_length": 162.875, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "epoch": 0.01527527315158587, "frac_reward_zero_std": 1.0, "grad_norm": 0.10302734375, "kl": 0.021103895822307095, "learning_rate": 7.9428e-06, "loss": 0.0008, "num_tokens": 6640832.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 144 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 308.0, "completions/max_terminated_length": 308.0, "completions/mean_length": 235.125, "completions/mean_terminated_length": 235.125, "completions/min_length": 183.0, "completions/min_terminated_length": 183.0, "epoch": 0.015381351437360772, "frac_reward_zero_std": 0.0, "grad_norm": 1.5625, "kl": 0.0223095424589701, "learning_rate": 7.9424e-06, "loss": 0.0009, "num_tokens": 6686628.0, "reward": 3.0702104568481445, "reward_std": 0.591320812702179, "rewards/reward_fn/mean": 3.0702104568481445, "rewards/reward_fn/std": 0.5913207530975342, "step": 145 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 289.0, "completions/max_terminated_length": 289.0, "completions/mean_length": 211.34375, "completions/mean_terminated_length": 211.34375, "completions/min_length": 178.0, "completions/min_terminated_length": 178.0, "epoch": 0.015487429723135673, "frac_reward_zero_std": 0.0, "grad_norm": 1.6796875, "kl": 0.027028481126762927, "learning_rate": 7.942e-06, "loss": 0.0011, "num_tokens": 6731471.0, "reward": 3.1773626804351807, "reward_std": 0.22673729062080383, "rewards/reward_fn/mean": 3.1773626804351807, "rewards/reward_fn/std": 0.22673727571964264, "step": 146 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 303.0, "completions/max_terminated_length": 303.0, "completions/mean_length": 220.125, "completions/mean_terminated_length": 220.125, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "epoch": 0.015593508008910576, "frac_reward_zero_std": 0.0, "grad_norm": 1.96875, "kl": 0.039822932973038405, "learning_rate": 7.9416e-06, "loss": 0.0016, "num_tokens": 6775379.0, "reward": 2.9473490715026855, "reward_std": 0.40440940856933594, "rewards/reward_fn/mean": 2.9473490715026855, "rewards/reward_fn/std": 0.40440940856933594, "step": 147 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 919.0, "completions/max_terminated_length": 919.0, "completions/mean_length": 575.1875, "completions/mean_terminated_length": 575.1875, "completions/min_length": 339.0, "completions/min_terminated_length": 339.0, "epoch": 0.01569958629468548, "frac_reward_zero_std": 0.0, "grad_norm": 1.2109375, "kl": 0.023320580541621894, "learning_rate": 7.9412e-06, "loss": 0.0009, "num_tokens": 6835545.0, "reward": 2.5726113319396973, "reward_std": 0.3018931448459625, "rewards/reward_fn/mean": 2.5726113319396973, "rewards/reward_fn/std": 0.3018931448459625, "step": 148 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 144.0, "completions/max_terminated_length": 144.0, "completions/mean_length": 104.375, "completions/mean_terminated_length": 104.375, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "epoch": 0.015805664580460378, "frac_reward_zero_std": 1.0, "grad_norm": 0.21484375, "kl": 0.032563303771894425, "learning_rate": 7.9408e-06, "loss": 0.0013, "num_tokens": 6872037.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 149 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 197.0, "completions/max_terminated_length": 197.0, "completions/mean_length": 155.8125, "completions/mean_terminated_length": 155.8125, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "epoch": 0.01591174286623528, "frac_reward_zero_std": 1.0, "grad_norm": 0.1396484375, "kl": 0.04502526589203626, "learning_rate": 7.9404e-06, "loss": 0.0018, "num_tokens": 6905183.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 150 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 393.0, "completions/max_terminated_length": 393.0, "completions/mean_length": 223.1875, "completions/mean_terminated_length": 223.1875, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "epoch": 0.016017821152010184, "frac_reward_zero_std": 0.0, "grad_norm": 1.6015625, "kl": 0.039221919083502144, "learning_rate": 7.94e-06, "loss": 0.0016, "num_tokens": 6945445.0, "reward": 3.901937961578369, "reward_std": 0.3098265826702118, "rewards/reward_fn/mean": 3.901937961578369, "rewards/reward_fn/std": 0.3098265528678894, "step": 151 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 476.0, "completions/max_terminated_length": 476.0, "completions/mean_length": 338.75, "completions/mean_terminated_length": 338.75, "completions/min_length": 270.0, "completions/min_terminated_length": 270.0, "epoch": 0.016123899437785087, "frac_reward_zero_std": 0.0, "grad_norm": 1.2734375, "kl": 0.03526619606418535, "learning_rate": 7.9396e-06, "loss": 0.0014, "num_tokens": 6993149.0, "reward": 3.3024063110351562, "reward_std": 0.651430606842041, "rewards/reward_fn/mean": 3.3024063110351562, "rewards/reward_fn/std": 0.6514305472373962, "step": 152 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 289.0, "completions/max_terminated_length": 289.0, "completions/mean_length": 187.25, "completions/mean_terminated_length": 187.25, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "epoch": 0.016229977723559986, "frac_reward_zero_std": 1.0, "grad_norm": 0.0908203125, "kl": 0.04323679523076862, "learning_rate": 7.939199999999998e-06, "loss": 0.0017, "num_tokens": 7032805.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 153 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 152.0, "completions/max_terminated_length": 152.0, "completions/mean_length": 120.90625, "completions/mean_terminated_length": 120.90625, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "epoch": 0.01633605600933489, "frac_reward_zero_std": 0.0, "grad_norm": 2.78125, "kl": 0.06397866155020893, "learning_rate": 7.9388e-06, "loss": 0.0025, "num_tokens": 7065250.0, "reward": 2.901592969894409, "reward_std": 0.022638218477368355, "rewards/reward_fn/mean": 2.901592969894409, "rewards/reward_fn/std": 0.02263822965323925, "step": 154 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 232.0, "completions/max_terminated_length": 232.0, "completions/mean_length": 161.875, "completions/mean_terminated_length": 161.875, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "epoch": 0.01644213429510979, "frac_reward_zero_std": 0.0, "grad_norm": 2.25, "kl": 0.06367940676864237, "learning_rate": 7.9384e-06, "loss": 0.0025, "num_tokens": 7111198.0, "reward": 3.8250930309295654, "reward_std": 0.3699547052383423, "rewards/reward_fn/mean": 3.8250930309295654, "rewards/reward_fn/std": 0.3699546456336975, "step": 155 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 397.0, "completions/max_terminated_length": 397.0, "completions/mean_length": 264.8125, "completions/mean_terminated_length": 264.8125, "completions/min_length": 204.0, "completions/min_terminated_length": 204.0, "epoch": 0.016548212580884694, "frac_reward_zero_std": 1.0, "grad_norm": 0.12451171875, "kl": 0.048206568928435445, "learning_rate": 7.938e-06, "loss": 0.0019, "num_tokens": 7173912.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 156 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 123.0, "completions/max_terminated_length": 123.0, "completions/mean_length": 114.21875, "completions/mean_terminated_length": 114.21875, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "epoch": 0.016654290866659593, "frac_reward_zero_std": 1.0, "grad_norm": 0.1806640625, "kl": 0.04626023437594995, "learning_rate": 7.9376e-06, "loss": 0.0019, "num_tokens": 7205311.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 157 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 124.0, "completions/max_terminated_length": 124.0, "completions/mean_length": 117.5625, "completions/mean_terminated_length": 117.5625, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "epoch": 0.016760369152434496, "frac_reward_zero_std": 1.0, "grad_norm": 0.10302734375, "kl": 0.023312504577916116, "learning_rate": 7.9372e-06, "loss": 0.0009, "num_tokens": 7239921.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 158 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 440.0, "completions/max_terminated_length": 440.0, "completions/mean_length": 299.125, "completions/mean_terminated_length": 299.125, "completions/min_length": 196.0, "completions/min_terminated_length": 196.0, "epoch": 0.0168664474382094, "frac_reward_zero_std": 0.0, "grad_norm": 1.7421875, "kl": 0.048653680307324976, "learning_rate": 7.9368e-06, "loss": 0.0019, "num_tokens": 7282933.0, "reward": 3.4297356605529785, "reward_std": 0.4795074164867401, "rewards/reward_fn/mean": 3.4297356605529785, "rewards/reward_fn/std": 0.4795074760913849, "step": 159 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 391.0, "completions/max_terminated_length": 391.0, "completions/mean_length": 310.4375, "completions/mean_terminated_length": 310.4375, "completions/min_length": 277.0, "completions/min_terminated_length": 277.0, "epoch": 0.016972525723984302, "frac_reward_zero_std": 0.0, "grad_norm": 1.0078125, "kl": 0.03421498887473717, "learning_rate": 7.936399999999999e-06, "loss": 0.0014, "num_tokens": 7322243.0, "reward": 2.825167179107666, "reward_std": 0.0166213046759367, "rewards/reward_fn/mean": 2.825167179107666, "rewards/reward_fn/std": 0.01662134751677513, "step": 160 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 291.0, "completions/max_terminated_length": 291.0, "completions/mean_length": 223.84375, "completions/mean_terminated_length": 223.84375, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "epoch": 0.0170786040097592, "frac_reward_zero_std": 0.0, "grad_norm": 2.03125, "kl": 0.048001356422901154, "learning_rate": 7.936e-06, "loss": 0.0019, "num_tokens": 7358174.0, "reward": 1.793137788772583, "reward_std": 0.006353580858558416, "rewards/reward_fn/mean": 1.793137788772583, "rewards/reward_fn/std": 0.006353587377816439, "step": 161 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 318.0, "completions/max_terminated_length": 318.0, "completions/mean_length": 185.03125, "completions/mean_terminated_length": 185.03125, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "epoch": 0.017184682295534104, "frac_reward_zero_std": 0.0, "grad_norm": 1.6953125, "kl": 0.04288411437300965, "learning_rate": 7.935599999999999e-06, "loss": 0.0017, "num_tokens": 7395775.0, "reward": 2.704394817352295, "reward_std": 0.028164710849523544, "rewards/reward_fn/mean": 2.704394817352295, "rewards/reward_fn/std": 0.028164727613329887, "step": 162 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 188.0, "completions/max_terminated_length": 188.0, "completions/mean_length": 128.15625, "completions/mean_terminated_length": 128.15625, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "epoch": 0.017290760581309007, "frac_reward_zero_std": 1.0, "grad_norm": 0.228515625, "kl": 0.029513808840420097, "learning_rate": 7.9352e-06, "loss": 0.0012, "num_tokens": 7441220.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 163 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 306.0, "completions/max_terminated_length": 306.0, "completions/mean_length": 274.4375, "completions/mean_terminated_length": 274.4375, "completions/min_length": 251.0, "completions/min_terminated_length": 251.0, "epoch": 0.01739683886708391, "frac_reward_zero_std": 1.0, "grad_norm": 0.07763671875, "kl": 0.03132422378985211, "learning_rate": 7.934799999999999e-06, "loss": 0.0013, "num_tokens": 7485106.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 164 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 495.0, "completions/mean_length": 328.625, "completions/mean_terminated_length": 273.1612854003906, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "epoch": 0.01750291715285881, "frac_reward_zero_std": 0.0, "grad_norm": 1.4765625, "kl": 0.0340282313991338, "learning_rate": 7.9344e-06, "loss": 0.0014, "num_tokens": 7507526.0, "reward": 3.6591928005218506, "reward_std": 0.8599532246589661, "rewards/reward_fn/mean": 3.6591928005218506, "rewards/reward_fn/std": 0.8599532842636108, "step": 165 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 434.0, "completions/max_terminated_length": 434.0, "completions/mean_length": 228.84375, "completions/mean_terminated_length": 228.84375, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "epoch": 0.01760899543863371, "frac_reward_zero_std": 0.0, "grad_norm": 1.4453125, "kl": 0.061049588199239224, "learning_rate": 7.934e-06, "loss": 0.0024, "num_tokens": 7561217.0, "reward": 3.0768370628356934, "reward_std": 0.06554757058620453, "rewards/reward_fn/mean": 3.0768370628356934, "rewards/reward_fn/std": 0.06554758548736572, "step": 166 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 93.0, "completions/max_terminated_length": 93.0, "completions/mean_length": 73.96875, "completions/mean_terminated_length": 73.96875, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 0.017715073724408614, "frac_reward_zero_std": 1.0, "grad_norm": 0.232421875, "kl": 0.03734696819446981, "learning_rate": 7.9336e-06, "loss": 0.0015, "num_tokens": 7605536.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 167 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 661.0, "completions/max_terminated_length": 661.0, "completions/mean_length": 301.25, "completions/mean_terminated_length": 301.25, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "epoch": 0.017821152010183517, "frac_reward_zero_std": 0.0, "grad_norm": 1.9140625, "kl": 0.05416923708980903, "learning_rate": 7.9332e-06, "loss": 0.0022, "num_tokens": 7651144.0, "reward": 2.543686628341675, "reward_std": 0.5045425891876221, "rewards/reward_fn/mean": 2.543686628341675, "rewards/reward_fn/std": 0.5045425891876221, "step": 168 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 201.0, "completions/max_terminated_length": 201.0, "completions/mean_length": 180.0, "completions/mean_terminated_length": 180.0, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "epoch": 0.017927230295958416, "frac_reward_zero_std": 1.0, "grad_norm": 0.10498046875, "kl": 0.03905608196510002, "learning_rate": 7.9328e-06, "loss": 0.0016, "num_tokens": 7701384.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 169 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1153.0, "completions/max_terminated_length": 1153.0, "completions/mean_length": 426.53125, "completions/mean_terminated_length": 426.53125, "completions/min_length": 231.0, "completions/min_terminated_length": 231.0, "epoch": 0.01803330858173332, "frac_reward_zero_std": 0.0, "grad_norm": 1.5390625, "kl": 0.04825877823168412, "learning_rate": 7.9324e-06, "loss": 0.0019, "num_tokens": 7751033.0, "reward": 2.7124316692352295, "reward_std": 0.5163800120353699, "rewards/reward_fn/mean": 2.7124316692352295, "rewards/reward_fn/std": 0.5163800716400146, "step": 170 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 205.0, "completions/max_terminated_length": 205.0, "completions/mean_length": 180.21875, "completions/mean_terminated_length": 180.21875, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "epoch": 0.018139386867508222, "frac_reward_zero_std": 1.0, "grad_norm": 0.1318359375, "kl": 0.04462099994998425, "learning_rate": 7.932e-06, "loss": 0.0018, "num_tokens": 7798016.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 171 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 137.0, "completions/max_terminated_length": 137.0, "completions/mean_length": 116.46875, "completions/mean_terminated_length": 116.46875, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "epoch": 0.01824546515328312, "frac_reward_zero_std": 1.0, "grad_norm": 0.173828125, "kl": 0.04711482278071344, "learning_rate": 7.9316e-06, "loss": 0.0019, "num_tokens": 7833263.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 172 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 243.0, "completions/max_terminated_length": 243.0, "completions/mean_length": 209.84375, "completions/mean_terminated_length": 209.84375, "completions/min_length": 185.0, "completions/min_terminated_length": 185.0, "epoch": 0.018351543439058024, "frac_reward_zero_std": 0.0, "grad_norm": 1.8203125, "kl": 0.05251658370252699, "learning_rate": 7.9312e-06, "loss": 0.0021, "num_tokens": 7879786.0, "reward": 3.908167839050293, "reward_std": 0.2901296019554138, "rewards/reward_fn/mean": 3.908167839050293, "rewards/reward_fn/std": 0.2901296317577362, "step": 173 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 239.0, "completions/max_terminated_length": 239.0, "completions/mean_length": 197.90625, "completions/mean_terminated_length": 197.90625, "completions/min_length": 179.0, "completions/min_terminated_length": 179.0, "epoch": 0.018457621724832927, "frac_reward_zero_std": 1.0, "grad_norm": 0.0888671875, "kl": 0.019432318513281643, "learning_rate": 7.930799999999999e-06, "loss": 0.0008, "num_tokens": 7924391.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 174 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 134.0, "completions/max_terminated_length": 134.0, "completions/mean_length": 130.78125, "completions/mean_terminated_length": 130.78125, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "epoch": 0.01856370001060783, "frac_reward_zero_std": 0.0, "grad_norm": 3.5, "kl": 0.030577484285458922, "learning_rate": 7.9304e-06, "loss": 0.0012, "num_tokens": 7962624.0, "reward": 3.843151092529297, "reward_std": 0.27601996064186096, "rewards/reward_fn/mean": 3.843151092529297, "rewards/reward_fn/std": 0.2760199308395386, "step": 175 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 165.0, "completions/max_terminated_length": 165.0, "completions/mean_length": 104.78125, "completions/mean_terminated_length": 104.78125, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "epoch": 0.01866977829638273, "frac_reward_zero_std": 1.0, "grad_norm": 0.240234375, "kl": 0.052696100203320384, "learning_rate": 7.929999999999999e-06, "loss": 0.0021, "num_tokens": 7989881.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 176 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 67.0, "completions/max_terminated_length": 67.0, "completions/mean_length": 60.0, "completions/mean_terminated_length": 60.0, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "epoch": 0.01877585658215763, "frac_reward_zero_std": 1.0, "grad_norm": 0.2255859375, "kl": 0.04239236278226599, "learning_rate": 7.9296e-06, "loss": 0.0017, "num_tokens": 8023833.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 177 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 178.0, "completions/max_terminated_length": 178.0, "completions/mean_length": 165.8125, "completions/mean_terminated_length": 165.8125, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "epoch": 0.018881934867932534, "frac_reward_zero_std": 1.0, "grad_norm": 0.1083984375, "kl": 0.029231760883703828, "learning_rate": 7.9292e-06, "loss": 0.0012, "num_tokens": 8062675.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 178 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 265.0, "completions/max_terminated_length": 265.0, "completions/mean_length": 216.5, "completions/mean_terminated_length": 216.5, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "epoch": 0.018988013153707437, "frac_reward_zero_std": 1.0, "grad_norm": 0.1142578125, "kl": 0.039269523753318936, "learning_rate": 7.9288e-06, "loss": 0.0016, "num_tokens": 8108643.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 179 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 143.0, "completions/max_terminated_length": 143.0, "completions/mean_length": 119.34375, "completions/mean_terminated_length": 119.34375, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "epoch": 0.019094091439482336, "frac_reward_zero_std": 1.0, "grad_norm": 0.1083984375, "kl": 0.03951008943840861, "learning_rate": 7.9284e-06, "loss": 0.0016, "num_tokens": 8143758.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 180 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 175.0, "completions/max_terminated_length": 175.0, "completions/mean_length": 166.84375, "completions/mean_terminated_length": 166.84375, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "epoch": 0.01920016972525724, "frac_reward_zero_std": 0.0, "grad_norm": 1.8125, "kl": 0.02374349971069023, "learning_rate": 7.928e-06, "loss": 0.001, "num_tokens": 8189033.0, "reward": 3.7921862602233887, "reward_std": 0.26086750626564026, "rewards/reward_fn/mean": 3.7921862602233887, "rewards/reward_fn/std": 0.26086747646331787, "step": 181 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 186.0, "completions/max_terminated_length": 186.0, "completions/mean_length": 136.59375, "completions/mean_terminated_length": 136.59375, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "epoch": 0.019306248011032142, "frac_reward_zero_std": 1.0, "grad_norm": 0.1884765625, "kl": 0.04318763309856877, "learning_rate": 7.9276e-06, "loss": 0.0017, "num_tokens": 8215548.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 182 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 363.0, "completions/max_terminated_length": 363.0, "completions/mean_length": 258.0625, "completions/mean_terminated_length": 258.0625, "completions/min_length": 209.0, "completions/min_terminated_length": 209.0, "epoch": 0.019412326296807045, "frac_reward_zero_std": 0.0, "grad_norm": 1.3515625, "kl": 0.05549651384353638, "learning_rate": 7.9272e-06, "loss": 0.0022, "num_tokens": 8261726.0, "reward": 2.7983384132385254, "reward_std": 0.01264275424182415, "rewards/reward_fn/mean": 2.7983384132385254, "rewards/reward_fn/std": 0.01264276821166277, "step": 183 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 787.0, "completions/max_terminated_length": 787.0, "completions/mean_length": 280.8125, "completions/mean_terminated_length": 280.8125, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "epoch": 0.019518404582581944, "frac_reward_zero_std": 0.0, "grad_norm": 3.40625, "kl": 0.03899583051679656, "learning_rate": 7.9268e-06, "loss": 0.0016, "num_tokens": 8311064.0, "reward": 3.5739221572875977, "reward_std": 0.4618890881538391, "rewards/reward_fn/mean": 3.5739221572875977, "rewards/reward_fn/std": 0.46188902854919434, "step": 184 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 347.0, "completions/max_terminated_length": 347.0, "completions/mean_length": 246.78125, "completions/mean_terminated_length": 246.78125, "completions/min_length": 188.0, "completions/min_terminated_length": 188.0, "epoch": 0.019624482868356847, "frac_reward_zero_std": 0.0, "grad_norm": 1.4609375, "kl": 0.04396548826480284, "learning_rate": 7.9264e-06, "loss": 0.0018, "num_tokens": 8348273.0, "reward": 2.785221576690674, "reward_std": 0.012262105010449886, "rewards/reward_fn/mean": 2.785221576690674, "rewards/reward_fn/std": 0.012262105941772461, "step": 185 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1926.0, "completions/mean_length": 591.90625, "completions/mean_terminated_length": 494.8333740234375, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "epoch": 0.01973056115413175, "frac_reward_zero_std": 0.0, "grad_norm": 0.8125, "kl": 0.028339633194264024, "learning_rate": 7.926e-06, "loss": 0.0011, "num_tokens": 8414830.0, "reward": 2.7476210594177246, "reward_std": 0.7238368391990662, "rewards/reward_fn/mean": 2.7476210594177246, "rewards/reward_fn/std": 0.7238367795944214, "step": 186 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1084.0, "completions/max_terminated_length": 1084.0, "completions/mean_length": 458.65625, "completions/mean_terminated_length": 458.65625, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "epoch": 0.019836639439906652, "frac_reward_zero_std": 1.0, "grad_norm": 0.08642578125, "kl": 0.03971007274230942, "learning_rate": 7.925599999999999e-06, "loss": 0.0016, "num_tokens": 8466883.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 187 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 847.0, "completions/max_terminated_length": 847.0, "completions/mean_length": 336.09375, "completions/mean_terminated_length": 336.09375, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "epoch": 0.019942717725681552, "frac_reward_zero_std": 0.0, "grad_norm": 2.25, "kl": 0.04593227559234947, "learning_rate": 7.9252e-06, "loss": 0.0018, "num_tokens": 8511878.0, "reward": 3.2010879516601562, "reward_std": 0.09355498105287552, "rewards/reward_fn/mean": 3.2010879516601562, "rewards/reward_fn/std": 0.09355500340461731, "step": 188 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1489.0, "completions/max_terminated_length": 1489.0, "completions/mean_length": 444.09375, "completions/mean_terminated_length": 444.09375, "completions/min_length": 211.0, "completions/min_terminated_length": 211.0, "epoch": 0.020048796011456455, "frac_reward_zero_std": 1.0, "grad_norm": 0.0791015625, "kl": 0.034340374055318534, "learning_rate": 7.9248e-06, "loss": 0.0014, "num_tokens": 8548201.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 189 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1693.0, "completions/max_terminated_length": 1693.0, "completions/mean_length": 563.90625, "completions/mean_terminated_length": 563.90625, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "epoch": 0.020154874297231357, "frac_reward_zero_std": 0.0, "grad_norm": 1.640625, "kl": 0.037472407915629447, "learning_rate": 7.9244e-06, "loss": 0.0015, "num_tokens": 8601350.0, "reward": 2.781393527984619, "reward_std": 0.017921049147844315, "rewards/reward_fn/mean": 2.781393527984619, "rewards/reward_fn/std": 0.017921047285199165, "step": 190 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1978.0, "completions/mean_length": 1138.15625, "completions/mean_terminated_length": 1044.034423828125, "completions/min_length": 283.0, "completions/min_terminated_length": 283.0, "epoch": 0.02026095258300626, "frac_reward_zero_std": 0.0, "grad_norm": 0.78515625, "kl": 0.027679344464559108, "learning_rate": 7.924e-06, "loss": 0.0011, "num_tokens": 8675243.0, "reward": 3.1059072017669678, "reward_std": 1.2772547006607056, "rewards/reward_fn/mean": 3.1059072017669678, "rewards/reward_fn/std": 1.2772547006607056, "step": 191 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 394.0, "completions/max_terminated_length": 394.0, "completions/mean_length": 160.3125, "completions/mean_terminated_length": 160.3125, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "epoch": 0.02036703086878116, "frac_reward_zero_std": 1.0, "grad_norm": 0.119140625, "kl": 0.02794185228412971, "learning_rate": 7.9236e-06, "loss": 0.0011, "num_tokens": 8721653.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 192 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 125.0, "completions/max_terminated_length": 125.0, "completions/mean_length": 102.46875, "completions/mean_terminated_length": 102.46875, "completions/min_length": 68.0, "completions/min_terminated_length": 68.0, "epoch": 0.020473109154556062, "frac_reward_zero_std": 1.0, "grad_norm": 0.158203125, "kl": 0.041104476957116276, "learning_rate": 7.923199999999999e-06, "loss": 0.0016, "num_tokens": 8746500.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 193 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 125.0, "completions/max_terminated_length": 125.0, "completions/mean_length": 104.5, "completions/mean_terminated_length": 104.5, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "epoch": 0.020579187440330965, "frac_reward_zero_std": 1.0, "grad_norm": 0.1923828125, "kl": 0.03695642208913341, "learning_rate": 7.9228e-06, "loss": 0.0015, "num_tokens": 8770548.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 194 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 81.0, "completions/max_terminated_length": 81.0, "completions/mean_length": 74.46875, "completions/mean_terminated_length": 74.46875, "completions/min_length": 68.0, "completions/min_terminated_length": 68.0, "epoch": 0.020685265726105868, "frac_reward_zero_std": 0.0, "grad_norm": 5.34375, "kl": 0.008636852275230922, "learning_rate": 7.922399999999999e-06, "loss": 0.0003, "num_tokens": 8807427.0, "reward": 3.072772264480591, "reward_std": 0.03884429484605789, "rewards/reward_fn/mean": 3.072772264480591, "rewards/reward_fn/std": 0.03884435072541237, "step": 195 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 262.0, "completions/max_terminated_length": 262.0, "completions/mean_length": 188.84375, "completions/mean_terminated_length": 188.84375, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "epoch": 0.020791344011880767, "frac_reward_zero_std": 1.0, "grad_norm": 0.087890625, "kl": 0.02465893269982189, "learning_rate": 7.922e-06, "loss": 0.001, "num_tokens": 8833886.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 196 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 229.0, "completions/max_terminated_length": 229.0, "completions/mean_length": 186.125, "completions/mean_terminated_length": 186.125, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "epoch": 0.02089742229765567, "frac_reward_zero_std": 0.0, "grad_norm": 2.09375, "kl": 0.030734462081454694, "learning_rate": 7.921599999999999e-06, "loss": 0.0012, "num_tokens": 8879938.0, "reward": 2.784573554992676, "reward_std": 0.02381717413663864, "rewards/reward_fn/mean": 2.784573554992676, "rewards/reward_fn/std": 0.02381720580160618, "step": 197 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 108.0, "completions/max_terminated_length": 108.0, "completions/mean_length": 86.5625, "completions/mean_terminated_length": 86.5625, "completions/min_length": 57.0, "completions/min_terminated_length": 57.0, "epoch": 0.021003500583430573, "frac_reward_zero_std": 1.0, "grad_norm": 0.28515625, "kl": 0.07277392386458814, "learning_rate": 7.9212e-06, "loss": 0.0029, "num_tokens": 8926068.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 198 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 254.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 194.125, "completions/mean_terminated_length": 194.125, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "epoch": 0.021109578869205475, "frac_reward_zero_std": 0.0, "grad_norm": 1.921875, "kl": 0.03341373009607196, "learning_rate": 7.920799999999999e-06, "loss": 0.0013, "num_tokens": 8962680.0, "reward": 3.2303833961486816, "reward_std": 0.48945605754852295, "rewards/reward_fn/mean": 3.2303833961486816, "rewards/reward_fn/std": 0.48945602774620056, "step": 199 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 199.0, "completions/max_terminated_length": 199.0, "completions/mean_length": 164.84375, "completions/mean_terminated_length": 164.84375, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "epoch": 0.021215657154980375, "frac_reward_zero_std": 0.0, "grad_norm": 2.078125, "kl": 0.04750790970865637, "learning_rate": 7.9204e-06, "loss": 0.0019, "num_tokens": 9005907.0, "reward": 3.969318151473999, "reward_std": 0.17356248199939728, "rewards/reward_fn/mean": 3.969318151473999, "rewards/reward_fn/std": 0.17356249690055847, "step": 200 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 310.0, "completions/max_terminated_length": 310.0, "completions/mean_length": 206.5625, "completions/mean_terminated_length": 206.5625, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "epoch": 0.021321735440755277, "frac_reward_zero_std": 0.0, "grad_norm": 2.296875, "kl": 0.038540702196769416, "learning_rate": 7.92e-06, "loss": 0.0015, "num_tokens": 9044805.0, "reward": 3.435892343521118, "reward_std": 0.6499969959259033, "rewards/reward_fn/mean": 3.435892343521118, "rewards/reward_fn/std": 0.6499969959259033, "step": 201 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 677.0, "completions/max_terminated_length": 677.0, "completions/mean_length": 309.75, "completions/mean_terminated_length": 309.75, "completions/min_length": 176.0, "completions/min_terminated_length": 176.0, "epoch": 0.02142781372653018, "frac_reward_zero_std": 0.0, "grad_norm": 1.5078125, "kl": 0.02935608651023358, "learning_rate": 7.9196e-06, "loss": 0.0012, "num_tokens": 9096125.0, "reward": 3.915349006652832, "reward_std": 0.26741769909858704, "rewards/reward_fn/mean": 3.915349006652832, "rewards/reward_fn/std": 0.26741769909858704, "step": 202 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 520.0, "completions/max_terminated_length": 520.0, "completions/mean_length": 325.6875, "completions/mean_terminated_length": 325.6875, "completions/min_length": 233.0, "completions/min_terminated_length": 233.0, "epoch": 0.02153389201230508, "frac_reward_zero_std": 0.0, "grad_norm": 1.3984375, "kl": 0.03687985229771584, "learning_rate": 7.9192e-06, "loss": 0.0015, "num_tokens": 9147411.0, "reward": 2.7406442165374756, "reward_std": 0.027461480349302292, "rewards/reward_fn/mean": 2.7406442165374756, "rewards/reward_fn/std": 0.027461478486657143, "step": 203 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 355.0, "completions/max_terminated_length": 355.0, "completions/mean_length": 221.0625, "completions/mean_terminated_length": 221.0625, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "epoch": 0.021639970298079982, "frac_reward_zero_std": 0.0, "grad_norm": 1.9765625, "kl": 0.04066753340885043, "learning_rate": 7.9188e-06, "loss": 0.0016, "num_tokens": 9186165.0, "reward": 3.394157886505127, "reward_std": 0.269808292388916, "rewards/reward_fn/mean": 3.394157886505127, "rewards/reward_fn/std": 0.269808292388916, "step": 204 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 294.0, "completions/max_terminated_length": 294.0, "completions/mean_length": 239.1875, "completions/mean_terminated_length": 239.1875, "completions/min_length": 185.0, "completions/min_terminated_length": 185.0, "epoch": 0.021746048583854885, "frac_reward_zero_std": 1.0, "grad_norm": 0.06005859375, "kl": 0.02556429200922139, "learning_rate": 7.9184e-06, "loss": 0.001, "num_tokens": 9215099.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 205 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 520.0, "completions/max_terminated_length": 520.0, "completions/mean_length": 340.0, "completions/mean_terminated_length": 340.0, "completions/min_length": 238.0, "completions/min_terminated_length": 238.0, "epoch": 0.021852126869629788, "frac_reward_zero_std": 0.0, "grad_norm": 1.2109375, "kl": 0.038756676425691694, "learning_rate": 7.918e-06, "loss": 0.0016, "num_tokens": 9273627.0, "reward": 3.86188006401062, "reward_std": 0.4663735628128052, "rewards/reward_fn/mean": 3.86188006401062, "rewards/reward_fn/std": 0.4663735628128052, "step": 206 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 286.0, "completions/max_terminated_length": 286.0, "completions/mean_length": 214.03125, "completions/mean_terminated_length": 214.03125, "completions/min_length": 186.0, "completions/min_terminated_length": 186.0, "epoch": 0.021958205155404687, "frac_reward_zero_std": 1.0, "grad_norm": 0.087890625, "kl": 0.04700932162813842, "learning_rate": 7.9176e-06, "loss": 0.0019, "num_tokens": 9310940.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 207 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1511.0, "completions/max_terminated_length": 1511.0, "completions/mean_length": 681.5625, "completions/mean_terminated_length": 681.5625, "completions/min_length": 374.0, "completions/min_terminated_length": 374.0, "epoch": 0.02206428344117959, "frac_reward_zero_std": 0.0, "grad_norm": 1.0546875, "kl": 0.02947085179039277, "learning_rate": 7.9172e-06, "loss": 0.0012, "num_tokens": 9364302.0, "reward": 3.9197487831115723, "reward_std": 0.3157848119735718, "rewards/reward_fn/mean": 3.9197487831115723, "rewards/reward_fn/std": 0.3157848119735718, "step": 208 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 259.0, "completions/max_terminated_length": 259.0, "completions/mean_length": 197.625, "completions/mean_terminated_length": 197.625, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "epoch": 0.022170361726954493, "frac_reward_zero_std": 0.0, "grad_norm": 1.796875, "kl": 0.03974838956492022, "learning_rate": 7.9168e-06, "loss": 0.0016, "num_tokens": 9402146.0, "reward": 3.14357852935791, "reward_std": 0.41966521739959717, "rewards/reward_fn/mean": 3.14357852935791, "rewards/reward_fn/std": 0.41966521739959717, "step": 209 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 265.0, "completions/max_terminated_length": 265.0, "completions/mean_length": 182.5625, "completions/mean_terminated_length": 182.5625, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "epoch": 0.022276440012729395, "frac_reward_zero_std": 0.0, "grad_norm": 2.265625, "kl": 0.049357162322849035, "learning_rate": 7.9164e-06, "loss": 0.002, "num_tokens": 9444852.0, "reward": 3.931201934814453, "reward_std": 0.3891806900501251, "rewards/reward_fn/mean": 3.931201934814453, "rewards/reward_fn/std": 0.3891806900501251, "step": 210 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 99.0, "completions/max_terminated_length": 99.0, "completions/mean_length": 88.96875, "completions/mean_terminated_length": 88.96875, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "epoch": 0.022382518298504295, "frac_reward_zero_std": 1.0, "grad_norm": 0.224609375, "kl": 0.06599157059099525, "learning_rate": 7.916e-06, "loss": 0.0026, "num_tokens": 9485139.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 211 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 190.0, "completions/max_terminated_length": 190.0, "completions/mean_length": 152.0625, "completions/mean_terminated_length": 152.0625, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "epoch": 0.022488596584279198, "frac_reward_zero_std": 1.0, "grad_norm": 0.13671875, "kl": 0.0660236410330981, "learning_rate": 7.9156e-06, "loss": 0.0026, "num_tokens": 9525557.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 212 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 261.0, "completions/max_terminated_length": 261.0, "completions/mean_length": 208.0625, "completions/mean_terminated_length": 208.0625, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "epoch": 0.0225946748700541, "frac_reward_zero_std": 1.0, "grad_norm": 0.11962890625, "kl": 0.059810824575833976, "learning_rate": 7.9152e-06, "loss": 0.0024, "num_tokens": 9568375.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 213 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1020.0, "completions/max_terminated_length": 1020.0, "completions/mean_length": 476.15625, "completions/mean_terminated_length": 476.15625, "completions/min_length": 311.0, "completions/min_terminated_length": 311.0, "epoch": 0.022700753155829003, "frac_reward_zero_std": 0.0, "grad_norm": 0.7421875, "kl": 0.028504444286227226, "learning_rate": 7.9148e-06, "loss": 0.0011, "num_tokens": 9618044.0, "reward": 2.8191311359405518, "reward_std": 0.31098735332489014, "rewards/reward_fn/mean": 2.8191311359405518, "rewards/reward_fn/std": 0.31098735332489014, "step": 214 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 274.0, "completions/max_terminated_length": 274.0, "completions/mean_length": 227.25, "completions/mean_terminated_length": 227.25, "completions/min_length": 196.0, "completions/min_terminated_length": 196.0, "epoch": 0.022806831441603902, "frac_reward_zero_std": 0.0, "grad_norm": 1.5625, "kl": 0.026997349661542103, "learning_rate": 7.9144e-06, "loss": 0.0011, "num_tokens": 9663748.0, "reward": 3.4124526977539062, "reward_std": 0.1661146730184555, "rewards/reward_fn/mean": 3.4124526977539062, "rewards/reward_fn/std": 0.1661146730184555, "step": 215 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 323.0, "completions/max_terminated_length": 323.0, "completions/mean_length": 234.375, "completions/mean_terminated_length": 234.375, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "epoch": 0.022912909727378805, "frac_reward_zero_std": 0.0, "grad_norm": 1.7890625, "kl": 0.04880005633458495, "learning_rate": 7.913999999999999e-06, "loss": 0.002, "num_tokens": 9709584.0, "reward": 3.9680237770080566, "reward_std": 0.18088558316230774, "rewards/reward_fn/mean": 3.9680237770080566, "rewards/reward_fn/std": 0.18088559806346893, "step": 216 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 453.0, "completions/max_terminated_length": 453.0, "completions/mean_length": 264.65625, "completions/mean_terminated_length": 264.65625, "completions/min_length": 196.0, "completions/min_terminated_length": 196.0, "epoch": 0.023018988013153708, "frac_reward_zero_std": 0.0, "grad_norm": 1.484375, "kl": 0.051924208470154554, "learning_rate": 7.9136e-06, "loss": 0.0021, "num_tokens": 9753861.0, "reward": 1.794126033782959, "reward_std": 0.02186727151274681, "rewards/reward_fn/mean": 1.794126033782959, "rewards/reward_fn/std": 0.02186727523803711, "step": 217 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 318.0, "completions/max_terminated_length": 318.0, "completions/mean_length": 223.8125, "completions/mean_terminated_length": 223.8125, "completions/min_length": 187.0, "completions/min_terminated_length": 187.0, "epoch": 0.02312506629892861, "frac_reward_zero_std": 0.0, "grad_norm": 2.625, "kl": 0.041035423579160124, "learning_rate": 7.913199999999999e-06, "loss": 0.0016, "num_tokens": 9802303.0, "reward": 2.8291447162628174, "reward_std": 0.024067319929599762, "rewards/reward_fn/mean": 2.8291447162628174, "rewards/reward_fn/std": 0.024067312479019165, "step": 218 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 664.0, "completions/max_terminated_length": 664.0, "completions/mean_length": 489.6875, "completions/mean_terminated_length": 489.6875, "completions/min_length": 319.0, "completions/min_terminated_length": 319.0, "epoch": 0.02323114458470351, "frac_reward_zero_std": 0.0, "grad_norm": 1.2734375, "kl": 0.03248375124530867, "learning_rate": 7.9128e-06, "loss": 0.0013, "num_tokens": 9868373.0, "reward": 3.9269018173217773, "reward_std": 0.41350504755973816, "rewards/reward_fn/mean": 3.9269018173217773, "rewards/reward_fn/std": 0.4135049879550934, "step": 219 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 326.0, "completions/max_terminated_length": 326.0, "completions/mean_length": 269.84375, "completions/mean_terminated_length": 269.84375, "completions/min_length": 178.0, "completions/min_terminated_length": 178.0, "epoch": 0.023337222870478413, "frac_reward_zero_std": 0.0, "grad_norm": 1.5, "kl": 0.03564566490240395, "learning_rate": 7.912399999999999e-06, "loss": 0.0014, "num_tokens": 9908272.0, "reward": 2.8743367195129395, "reward_std": 0.010945392772555351, "rewards/reward_fn/mean": 2.8743367195129395, "rewards/reward_fn/std": 0.01094542071223259, "step": 220 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 849.0, "completions/max_terminated_length": 849.0, "completions/mean_length": 307.9375, "completions/mean_terminated_length": 307.9375, "completions/min_length": 199.0, "completions/min_terminated_length": 199.0, "epoch": 0.023443301156253316, "frac_reward_zero_std": 0.0, "grad_norm": 1.0546875, "kl": 0.049371514352969825, "learning_rate": 7.912e-06, "loss": 0.002, "num_tokens": 9955278.0, "reward": 2.8059816360473633, "reward_std": 0.21458326280117035, "rewards/reward_fn/mean": 2.8059816360473633, "rewards/reward_fn/std": 0.21458324790000916, "step": 221 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 250.0, "completions/max_terminated_length": 250.0, "completions/mean_length": 198.65625, "completions/mean_terminated_length": 198.65625, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "epoch": 0.02354937944202822, "frac_reward_zero_std": 1.0, "grad_norm": 0.0908203125, "kl": 0.04636182641843334, "learning_rate": 7.911599999999999e-06, "loss": 0.0019, "num_tokens": 9993603.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 222 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 295.0, "completions/max_terminated_length": 295.0, "completions/mean_length": 215.09375, "completions/mean_terminated_length": 215.09375, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "epoch": 0.023655457727803118, "frac_reward_zero_std": 1.0, "grad_norm": 0.1064453125, "kl": 0.037611712352372706, "learning_rate": 7.9112e-06, "loss": 0.0015, "num_tokens": 10035526.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 223 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 547.0, "completions/max_terminated_length": 547.0, "completions/mean_length": 404.1875, "completions/mean_terminated_length": 404.1875, "completions/min_length": 291.0, "completions/min_terminated_length": 291.0, "epoch": 0.02376153601357802, "frac_reward_zero_std": 0.0, "grad_norm": 1.0625, "kl": 0.031055110739544034, "learning_rate": 7.910799999999999e-06, "loss": 0.0012, "num_tokens": 10083276.0, "reward": 3.8559327125549316, "reward_std": 0.3414000868797302, "rewards/reward_fn/mean": 3.8559327125549316, "rewards/reward_fn/std": 0.34140002727508545, "step": 224 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 452.0, "completions/max_terminated_length": 452.0, "completions/mean_length": 178.90625, "completions/mean_terminated_length": 178.90625, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "epoch": 0.023867614299352923, "frac_reward_zero_std": 1.0, "grad_norm": 0.1396484375, "kl": 0.045679540548007935, "learning_rate": 7.9104e-06, "loss": 0.0018, "num_tokens": 10124553.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 225 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 623.0, "completions/max_terminated_length": 623.0, "completions/mean_length": 390.875, "completions/mean_terminated_length": 390.875, "completions/min_length": 287.0, "completions/min_terminated_length": 287.0, "epoch": 0.023973692585127826, "frac_reward_zero_std": 0.0, "grad_norm": 1.0625, "kl": 0.04144083388382569, "learning_rate": 7.91e-06, "loss": 0.0017, "num_tokens": 10200741.0, "reward": 2.8919546604156494, "reward_std": 0.07341831922531128, "rewards/reward_fn/mean": 2.8919546604156494, "rewards/reward_fn/std": 0.07341834157705307, "step": 226 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 368.0, "completions/max_terminated_length": 368.0, "completions/mean_length": 281.0, "completions/mean_terminated_length": 281.0, "completions/min_length": 226.0, "completions/min_terminated_length": 226.0, "epoch": 0.024079770870902725, "frac_reward_zero_std": 0.0, "grad_norm": 2.5, "kl": 0.041321132448501885, "learning_rate": 7.9096e-06, "loss": 0.0016, "num_tokens": 10241029.0, "reward": 3.0374879837036133, "reward_std": 0.026088356971740723, "rewards/reward_fn/mean": 3.0374879837036133, "rewards/reward_fn/std": 0.02608831785619259, "step": 227 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 379.0, "completions/max_terminated_length": 379.0, "completions/mean_length": 259.5, "completions/mean_terminated_length": 259.5, "completions/min_length": 211.0, "completions/min_terminated_length": 211.0, "epoch": 0.024185849156677628, "frac_reward_zero_std": 0.0, "grad_norm": 2.0, "kl": 0.05707435368094593, "learning_rate": 7.9092e-06, "loss": 0.0023, "num_tokens": 10278837.0, "reward": 2.99910569190979, "reward_std": 0.06235523894429207, "rewards/reward_fn/mean": 2.99910569190979, "rewards/reward_fn/std": 0.062355220317840576, "step": 228 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 179.0, "completions/max_terminated_length": 179.0, "completions/mean_length": 147.90625, "completions/mean_terminated_length": 147.90625, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "epoch": 0.02429192744245253, "frac_reward_zero_std": 1.0, "grad_norm": 0.1123046875, "kl": 0.055055871489457786, "learning_rate": 7.9088e-06, "loss": 0.0022, "num_tokens": 10311794.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 229 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 169.0, "completions/max_terminated_length": 169.0, "completions/mean_length": 105.6875, "completions/mean_terminated_length": 105.6875, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "epoch": 0.02439800572822743, "frac_reward_zero_std": 0.0, "grad_norm": 2.5, "kl": 0.06182589090894908, "learning_rate": 7.9084e-06, "loss": 0.0025, "num_tokens": 10361128.0, "reward": 3.989091396331787, "reward_std": 0.061708446592092514, "rewards/reward_fn/mean": 3.989091396331787, "rewards/reward_fn/std": 0.061708465218544006, "step": 230 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 112.0, "completions/max_terminated_length": 112.0, "completions/mean_length": 89.34375, "completions/mean_terminated_length": 89.34375, "completions/min_length": 79.0, "completions/min_terminated_length": 79.0, "epoch": 0.024504084014002333, "frac_reward_zero_std": 1.0, "grad_norm": 0.35546875, "kl": 0.0609287271508947, "learning_rate": 7.908e-06, "loss": 0.0024, "num_tokens": 10404307.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 231 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 643.0, "completions/max_terminated_length": 643.0, "completions/mean_length": 374.75, "completions/mean_terminated_length": 374.75, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "epoch": 0.024610162299777236, "frac_reward_zero_std": 0.0, "grad_norm": 1.4609375, "kl": 0.045864680025260895, "learning_rate": 7.9076e-06, "loss": 0.0018, "num_tokens": 10432651.0, "reward": 2.8449668884277344, "reward_std": 0.02234821394085884, "rewards/reward_fn/mean": 2.8449668884277344, "rewards/reward_fn/std": 0.022348226979374886, "step": 232 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 203.0, "completions/max_terminated_length": 203.0, "completions/mean_length": 175.0, "completions/mean_terminated_length": 175.0, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "epoch": 0.02471624058555214, "frac_reward_zero_std": 0.0, "grad_norm": 1.9140625, "kl": 0.0719969633501023, "learning_rate": 7.9072e-06, "loss": 0.0029, "num_tokens": 10478219.0, "reward": 3.701000690460205, "reward_std": 0.48612549901008606, "rewards/reward_fn/mean": 3.701000690460205, "rewards/reward_fn/std": 0.4861254394054413, "step": 233 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 97.0, "completions/max_terminated_length": 97.0, "completions/mean_length": 95.28125, "completions/mean_terminated_length": 95.28125, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "epoch": 0.024822318871327038, "frac_reward_zero_std": 1.0, "grad_norm": 0.142578125, "kl": 0.0665056451689452, "learning_rate": 7.906799999999999e-06, "loss": 0.0027, "num_tokens": 10514356.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 234 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 339.0, "completions/max_terminated_length": 339.0, "completions/mean_length": 187.0625, "completions/mean_terminated_length": 187.0625, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "epoch": 0.02492839715710194, "frac_reward_zero_std": 0.0, "grad_norm": 1.9375, "kl": 0.05317691981326789, "learning_rate": 7.9064e-06, "loss": 0.0021, "num_tokens": 10558326.0, "reward": 3.9692771434783936, "reward_std": 0.1737947314977646, "rewards/reward_fn/mean": 3.9692771434783936, "rewards/reward_fn/std": 0.17379476130008698, "step": 235 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 255.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 196.53125, "completions/mean_terminated_length": 196.53125, "completions/min_length": 181.0, "completions/min_terminated_length": 181.0, "epoch": 0.025034475442876843, "frac_reward_zero_std": 1.0, "grad_norm": 0.08544921875, "kl": 0.046237326750997454, "learning_rate": 7.905999999999999e-06, "loss": 0.0018, "num_tokens": 10611559.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 236 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 176.0, "completions/max_terminated_length": 176.0, "completions/mean_length": 143.09375, "completions/mean_terminated_length": 143.09375, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "epoch": 0.025140553728651746, "frac_reward_zero_std": 1.0, "grad_norm": 0.193359375, "kl": 0.07731026317924261, "learning_rate": 7.9056e-06, "loss": 0.0031, "num_tokens": 10636874.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 237 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 194.0, "completions/max_terminated_length": 194.0, "completions/mean_length": 153.9375, "completions/mean_terminated_length": 153.9375, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "epoch": 0.025246632014426645, "frac_reward_zero_std": 1.0, "grad_norm": 0.1484375, "kl": 0.0432970619876869, "learning_rate": 7.9052e-06, "loss": 0.0017, "num_tokens": 10695240.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 238 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 2048.0, "completions/max_terminated_length": 652.0, "completions/mean_length": 473.0625, "completions/mean_terminated_length": 368.0666809082031, "completions/min_length": 237.0, "completions/min_terminated_length": 237.0, "epoch": 0.025352710300201548, "frac_reward_zero_std": 0.0, "grad_norm": 1.3359375, "kl": 0.04987858441018034, "learning_rate": 7.9048e-06, "loss": 0.002, "num_tokens": 10725962.0, "reward": 1.7599223852157593, "reward_std": 0.5935760736465454, "rewards/reward_fn/mean": 1.7599223852157593, "rewards/reward_fn/std": 0.5935760140419006, "step": 239 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 774.0, "completions/max_terminated_length": 774.0, "completions/mean_length": 563.125, "completions/mean_terminated_length": 563.125, "completions/min_length": 322.0, "completions/min_terminated_length": 322.0, "epoch": 0.02545878858597645, "frac_reward_zero_std": 0.0, "grad_norm": 1.1171875, "kl": 0.038596726139076054, "learning_rate": 7.9044e-06, "loss": 0.0015, "num_tokens": 10788334.0, "reward": 2.9869730472564697, "reward_std": 0.33395835757255554, "rewards/reward_fn/mean": 2.9869730472564697, "rewards/reward_fn/std": 0.33395832777023315, "step": 240 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 315.0, "completions/max_terminated_length": 315.0, "completions/mean_length": 202.4375, "completions/mean_terminated_length": 202.4375, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "epoch": 0.025564866871751354, "frac_reward_zero_std": 0.0, "grad_norm": 2.234375, "kl": 0.06359338853508234, "learning_rate": 7.904e-06, "loss": 0.0025, "num_tokens": 10833500.0, "reward": 2.872314214706421, "reward_std": 0.05506020411849022, "rewards/reward_fn/mean": 2.872314214706421, "rewards/reward_fn/std": 0.055060189217329025, "step": 241 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 279.0, "completions/max_terminated_length": 279.0, "completions/mean_length": 181.0625, "completions/mean_terminated_length": 181.0625, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "epoch": 0.025670945157526253, "frac_reward_zero_std": 0.0, "grad_norm": 2.15625, "kl": 0.05014959839172661, "learning_rate": 7.9036e-06, "loss": 0.002, "num_tokens": 10874910.0, "reward": 2.904085636138916, "reward_std": 0.03150341659784317, "rewards/reward_fn/mean": 2.904085636138916, "rewards/reward_fn/std": 0.03150341659784317, "step": 242 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 227.0, "completions/max_terminated_length": 227.0, "completions/mean_length": 186.46875, "completions/mean_terminated_length": 186.46875, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "epoch": 0.025777023443301156, "frac_reward_zero_std": 0.0, "grad_norm": 1.3359375, "kl": 0.04699235921725631, "learning_rate": 7.903199999999999e-06, "loss": 0.0019, "num_tokens": 10896013.0, "reward": 3.967304229736328, "reward_std": 0.18495480716228485, "rewards/reward_fn/mean": 3.967304229736328, "rewards/reward_fn/std": 0.18495479226112366, "step": 243 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 236.0, "completions/max_terminated_length": 236.0, "completions/mean_length": 172.1875, "completions/mean_terminated_length": 172.1875, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "epoch": 0.02588310172907606, "frac_reward_zero_std": 0.0, "grad_norm": 1.671875, "kl": 0.06149169441778213, "learning_rate": 7.9028e-06, "loss": 0.0025, "num_tokens": 10939251.0, "reward": 3.968230724334717, "reward_std": 0.17971432209014893, "rewards/reward_fn/mean": 3.968230724334717, "rewards/reward_fn/std": 0.17971433699131012, "step": 244 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 103.0, "completions/max_terminated_length": 103.0, "completions/mean_length": 95.28125, "completions/mean_terminated_length": 95.28125, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "epoch": 0.02598918001485096, "frac_reward_zero_std": 1.0, "grad_norm": 0.181640625, "kl": 0.07846818270627409, "learning_rate": 7.902399999999999e-06, "loss": 0.0031, "num_tokens": 10971612.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 245 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 387.0, "completions/max_terminated_length": 387.0, "completions/mean_length": 272.5625, "completions/mean_terminated_length": 272.5625, "completions/min_length": 234.0, "completions/min_terminated_length": 234.0, "epoch": 0.02609525830062586, "frac_reward_zero_std": 0.0, "grad_norm": 1.421875, "kl": 0.06347244174685329, "learning_rate": 7.902e-06, "loss": 0.0025, "num_tokens": 11019918.0, "reward": 2.806910991668701, "reward_std": 0.19936612248420715, "rewards/reward_fn/mean": 2.806910991668701, "rewards/reward_fn/std": 0.19936615228652954, "step": 246 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 407.0, "completions/max_terminated_length": 407.0, "completions/mean_length": 230.21875, "completions/mean_terminated_length": 230.21875, "completions/min_length": 174.0, "completions/min_terminated_length": 174.0, "epoch": 0.026201336586400763, "frac_reward_zero_std": 0.0, "grad_norm": 1.9296875, "kl": 0.06723654980305582, "learning_rate": 7.901599999999999e-06, "loss": 0.0027, "num_tokens": 11062581.0, "reward": 3.2223823070526123, "reward_std": 0.4942571222782135, "rewards/reward_fn/mean": 3.2223823070526123, "rewards/reward_fn/std": 0.4942571222782135, "step": 247 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 114.0, "completions/max_terminated_length": 114.0, "completions/mean_length": 92.15625, "completions/mean_terminated_length": 92.15625, "completions/min_length": 73.0, "completions/min_terminated_length": 73.0, "epoch": 0.026307414872175666, "frac_reward_zero_std": 1.0, "grad_norm": 0.2041015625, "kl": 0.08666167384944856, "learning_rate": 7.9012e-06, "loss": 0.0035, "num_tokens": 11082490.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 248 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 308.0, "completions/max_terminated_length": 308.0, "completions/mean_length": 254.59375, "completions/mean_terminated_length": 254.59375, "completions/min_length": 204.0, "completions/min_terminated_length": 204.0, "epoch": 0.02641349315795057, "frac_reward_zero_std": 0.0, "grad_norm": 1.5, "kl": 0.047324330371338874, "learning_rate": 7.9008e-06, "loss": 0.0019, "num_tokens": 11123149.0, "reward": 2.9275622367858887, "reward_std": 0.01993408240377903, "rewards/reward_fn/mean": 2.9275622367858887, "rewards/reward_fn/std": 0.019934087991714478, "step": 249 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 762.0, "completions/max_terminated_length": 762.0, "completions/mean_length": 388.625, "completions/mean_terminated_length": 388.625, "completions/min_length": 299.0, "completions/min_terminated_length": 299.0, "epoch": 0.02651957144372547, "frac_reward_zero_std": 0.0, "grad_norm": 1.2265625, "kl": 0.03459140588529408, "learning_rate": 7.9004e-06, "loss": 0.0014, "num_tokens": 11172769.0, "reward": 2.702910900115967, "reward_std": 0.029907824471592903, "rewards/reward_fn/mean": 2.702910900115967, "rewards/reward_fn/std": 0.0299078281968832, "step": 250 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 258.0, "completions/max_terminated_length": 258.0, "completions/mean_length": 181.875, "completions/mean_terminated_length": 181.875, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "epoch": 0.02662564972950037, "frac_reward_zero_std": 0.0, "grad_norm": 2.015625, "kl": 0.04598871496273205, "learning_rate": 7.9e-06, "loss": 0.0018, "num_tokens": 11196861.0, "reward": 3.9364213943481445, "reward_std": 0.25018107891082764, "rewards/reward_fn/mean": 3.9364213943481445, "rewards/reward_fn/std": 0.25018110871315, "step": 251 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 377.0, "completions/max_terminated_length": 377.0, "completions/mean_length": 186.8125, "completions/mean_terminated_length": 186.8125, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "epoch": 0.026731728015275274, "frac_reward_zero_std": 0.0, "grad_norm": 1.3515625, "kl": 0.06444371730322018, "learning_rate": 7.8996e-06, "loss": 0.0026, "num_tokens": 11243735.0, "reward": 3.9610886573791504, "reward_std": 0.220115527510643, "rewards/reward_fn/mean": 3.9610886573791504, "rewards/reward_fn/std": 0.2201155424118042, "step": 252 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 332.0, "completions/max_terminated_length": 332.0, "completions/mean_length": 223.28125, "completions/mean_terminated_length": 223.28125, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "epoch": 0.026837806301050177, "frac_reward_zero_std": 1.0, "grad_norm": 0.09912109375, "kl": 0.04907032381743193, "learning_rate": 7.8992e-06, "loss": 0.002, "num_tokens": 11287904.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 253 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 591.0, "completions/max_terminated_length": 591.0, "completions/mean_length": 420.6875, "completions/mean_terminated_length": 420.6875, "completions/min_length": 290.0, "completions/min_terminated_length": 290.0, "epoch": 0.026943884586825076, "frac_reward_zero_std": 0.0, "grad_norm": 1.046875, "kl": 0.048126887530088425, "learning_rate": 7.8988e-06, "loss": 0.0019, "num_tokens": 11333430.0, "reward": 2.9162116050720215, "reward_std": 0.41033974289894104, "rewards/reward_fn/mean": 2.9162116050720215, "rewards/reward_fn/std": 0.41033968329429626, "step": 254 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 197.0, "completions/max_terminated_length": 197.0, "completions/mean_length": 149.65625, "completions/mean_terminated_length": 149.65625, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "epoch": 0.02704996287259998, "frac_reward_zero_std": 1.0, "grad_norm": 0.240234375, "kl": 0.08926701568998396, "learning_rate": 7.898399999999999e-06, "loss": 0.0036, "num_tokens": 11368939.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 255 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 227.21875, "completions/mean_terminated_length": 227.21875, "completions/min_length": 190.0, "completions/min_terminated_length": 190.0, "epoch": 0.02715604115837488, "frac_reward_zero_std": 0.0, "grad_norm": 1.7890625, "kl": 0.05576099781319499, "learning_rate": 7.898e-06, "loss": 0.0022, "num_tokens": 11424722.0, "reward": 3.6330835819244385, "reward_std": 0.7868334054946899, "rewards/reward_fn/mean": 3.6330835819244385, "rewards/reward_fn/std": 0.7868334054946899, "step": 256 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 131.0, "completions/max_terminated_length": 131.0, "completions/mean_length": 113.71875, "completions/mean_terminated_length": 113.71875, "completions/min_length": 78.0, "completions/min_terminated_length": 78.0, "epoch": 0.027262119444149784, "frac_reward_zero_std": 0.0, "grad_norm": 4.0625, "kl": 0.09824320953339338, "learning_rate": 7.897599999999999e-06, "loss": 0.0039, "num_tokens": 11456265.0, "reward": 3.084028720855713, "reward_std": 0.049738768488168716, "rewards/reward_fn/mean": 3.084028720855713, "rewards/reward_fn/std": 0.04973877593874931, "step": 257 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 277.0, "completions/max_terminated_length": 277.0, "completions/mean_length": 204.78125, "completions/mean_terminated_length": 204.78125, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "epoch": 0.027368197729924684, "frac_reward_zero_std": 0.0, "grad_norm": 1.453125, "kl": 0.06405076989904046, "learning_rate": 7.8972e-06, "loss": 0.0026, "num_tokens": 11495842.0, "reward": 3.972503185272217, "reward_std": 0.15554513037204742, "rewards/reward_fn/mean": 3.972503185272217, "rewards/reward_fn/std": 0.15554508566856384, "step": 258 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 252.0, "completions/max_terminated_length": 252.0, "completions/mean_length": 165.0625, "completions/mean_terminated_length": 165.0625, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "epoch": 0.027474276015699586, "frac_reward_zero_std": 1.0, "grad_norm": 0.1328125, "kl": 0.058812946430407465, "learning_rate": 7.896799999999999e-06, "loss": 0.0024, "num_tokens": 11531428.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 259 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 263.0, "completions/max_terminated_length": 263.0, "completions/mean_length": 192.34375, "completions/mean_terminated_length": 192.34375, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "epoch": 0.02758035430147449, "frac_reward_zero_std": 1.0, "grad_norm": 0.3671875, "kl": 0.07722758501768112, "learning_rate": 7.8964e-06, "loss": 0.0031, "num_tokens": 11570095.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 260 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 248.0, "completions/max_terminated_length": 248.0, "completions/mean_length": 174.71875, "completions/mean_terminated_length": 174.71875, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "epoch": 0.02768643258724939, "frac_reward_zero_std": 1.0, "grad_norm": 0.1435546875, "kl": 0.06501786492299289, "learning_rate": 7.896e-06, "loss": 0.0026, "num_tokens": 11612422.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 261 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 273.0, "completions/max_terminated_length": 273.0, "completions/mean_length": 163.75, "completions/mean_terminated_length": 163.75, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "epoch": 0.02779251087302429, "frac_reward_zero_std": 0.0, "grad_norm": 2.28125, "kl": 0.08121591794770211, "learning_rate": 7.8956e-06, "loss": 0.0033, "num_tokens": 11647198.0, "reward": 2.8717610836029053, "reward_std": 0.009377574548125267, "rewards/reward_fn/mean": 2.8717610836029053, "rewards/reward_fn/std": 0.009377571754157543, "step": 262 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 291.0, "completions/max_terminated_length": 291.0, "completions/mean_length": 192.53125, "completions/mean_terminated_length": 192.53125, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "epoch": 0.027898589158799194, "frac_reward_zero_std": 1.0, "grad_norm": 0.2314453125, "kl": 0.05851406557485461, "learning_rate": 7.8952e-06, "loss": 0.0023, "num_tokens": 11691119.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 263 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 562.0, "completions/max_terminated_length": 562.0, "completions/mean_length": 303.59375, "completions/mean_terminated_length": 303.59375, "completions/min_length": 235.0, "completions/min_terminated_length": 235.0, "epoch": 0.028004667444574097, "frac_reward_zero_std": 0.0, "grad_norm": 1.5078125, "kl": 0.060931759886443615, "learning_rate": 7.8948e-06, "loss": 0.0024, "num_tokens": 11730018.0, "reward": 2.826643466949463, "reward_std": 0.01925027370452881, "rewards/reward_fn/mean": 2.826643466949463, "rewards/reward_fn/std": 0.019250305369496346, "step": 264 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 186.0, "completions/max_terminated_length": 186.0, "completions/mean_length": 154.875, "completions/mean_terminated_length": 154.875, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "epoch": 0.028110745730348996, "frac_reward_zero_std": 0.0, "grad_norm": 2.03125, "kl": 0.05281880928669125, "learning_rate": 7.8944e-06, "loss": 0.0021, "num_tokens": 11773342.0, "reward": 2.93717885017395, "reward_std": 0.022945858538150787, "rewards/reward_fn/mean": 2.93717885017395, "rewards/reward_fn/std": 0.02294587530195713, "step": 265 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 148.0, "completions/max_terminated_length": 148.0, "completions/mean_length": 98.5, "completions/mean_terminated_length": 98.5, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "epoch": 0.0282168240161239, "frac_reward_zero_std": 1.0, "grad_norm": 0.251953125, "kl": 0.12909941375255585, "learning_rate": 7.894e-06, "loss": 0.0052, "num_tokens": 11814926.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 266 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 284.0, "completions/max_terminated_length": 284.0, "completions/mean_length": 176.9375, "completions/mean_terminated_length": 176.9375, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "epoch": 0.0283229023018988, "frac_reward_zero_std": 0.0, "grad_norm": 2.125, "kl": 0.05416906625032425, "learning_rate": 7.8936e-06, "loss": 0.0022, "num_tokens": 11853292.0, "reward": 1.8326301574707031, "reward_std": 0.24207736551761627, "rewards/reward_fn/mean": 1.8326301574707031, "rewards/reward_fn/std": 0.24207736551761627, "step": 267 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 201.0, "completions/max_terminated_length": 201.0, "completions/mean_length": 143.75, "completions/mean_terminated_length": 143.75, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "epoch": 0.028428980587673704, "frac_reward_zero_std": 0.0, "grad_norm": 2.671875, "kl": 0.04807147482642904, "learning_rate": 7.8932e-06, "loss": 0.0019, "num_tokens": 11891940.0, "reward": 3.042628526687622, "reward_std": 0.03994278982281685, "rewards/reward_fn/mean": 3.042628526687622, "rewards/reward_fn/std": 0.03994282707571983, "step": 268 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 254.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 171.25, "completions/mean_terminated_length": 171.25, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "epoch": 0.028535058873448604, "frac_reward_zero_std": 0.0, "grad_norm": 1.921875, "kl": 0.07457437110133469, "learning_rate": 7.8928e-06, "loss": 0.003, "num_tokens": 11967852.0, "reward": 3.9629762172698975, "reward_std": 0.2094382643699646, "rewards/reward_fn/mean": 3.9629762172698975, "rewards/reward_fn/std": 0.2094382643699646, "step": 269 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 167.0, "completions/max_terminated_length": 167.0, "completions/mean_length": 144.03125, "completions/mean_terminated_length": 144.03125, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "epoch": 0.028641137159223506, "frac_reward_zero_std": 1.0, "grad_norm": 0.16015625, "kl": 0.04853607731638476, "learning_rate": 7.8924e-06, "loss": 0.0019, "num_tokens": 12022797.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 270 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 111.0, "completions/max_terminated_length": 111.0, "completions/mean_length": 87.28125, "completions/mean_terminated_length": 87.28125, "completions/min_length": 70.0, "completions/min_terminated_length": 70.0, "epoch": 0.02874721544499841, "frac_reward_zero_std": 0.0, "grad_norm": 3.09375, "kl": 0.07282675942406058, "learning_rate": 7.892e-06, "loss": 0.0029, "num_tokens": 12060790.0, "reward": 3.9492380619049072, "reward_std": 0.13654832541942596, "rewards/reward_fn/mean": 3.9492380619049072, "rewards/reward_fn/std": 0.1365482658147812, "step": 271 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 187.0, "completions/max_terminated_length": 187.0, "completions/mean_length": 136.15625, "completions/mean_terminated_length": 136.15625, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "epoch": 0.028853293730773312, "frac_reward_zero_std": 0.0, "grad_norm": 2.453125, "kl": 0.09152523137163371, "learning_rate": 7.8916e-06, "loss": 0.0037, "num_tokens": 12106555.0, "reward": 3.120361804962158, "reward_std": 0.32416626811027527, "rewards/reward_fn/mean": 3.120361804962158, "rewards/reward_fn/std": 0.32416629791259766, "step": 272 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 178.0, "completions/max_terminated_length": 178.0, "completions/mean_length": 144.0625, "completions/mean_terminated_length": 144.0625, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "epoch": 0.02895937201654821, "frac_reward_zero_std": 0.0, "grad_norm": 3.28125, "kl": 0.05412678746506572, "learning_rate": 7.8912e-06, "loss": 0.0022, "num_tokens": 12142813.0, "reward": 2.741058111190796, "reward_std": 0.016797101125121117, "rewards/reward_fn/mean": 2.741058111190796, "rewards/reward_fn/std": 0.016797110438346863, "step": 273 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 218.0, "completions/max_terminated_length": 218.0, "completions/mean_length": 187.96875, "completions/mean_terminated_length": 187.96875, "completions/min_length": 170.0, "completions/min_terminated_length": 170.0, "epoch": 0.029065450302323114, "frac_reward_zero_std": 1.0, "grad_norm": 0.09130859375, "kl": 0.04939495923463255, "learning_rate": 7.890799999999999e-06, "loss": 0.002, "num_tokens": 12176988.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 274 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 692.0, "completions/max_terminated_length": 692.0, "completions/mean_length": 304.28125, "completions/mean_terminated_length": 304.28125, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "epoch": 0.029171528588098017, "frac_reward_zero_std": 0.0, "grad_norm": 2.046875, "kl": 0.06299524009227753, "learning_rate": 7.8904e-06, "loss": 0.0025, "num_tokens": 12221829.0, "reward": 2.059215545654297, "reward_std": 0.4553739130496979, "rewards/reward_fn/mean": 2.059215545654297, "rewards/reward_fn/std": 0.4553739130496979, "step": 275 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 142.0, "completions/max_terminated_length": 142.0, "completions/mean_length": 76.28125, "completions/mean_terminated_length": 76.28125, "completions/min_length": 59.0, "completions/min_terminated_length": 59.0, "epoch": 0.02927760687387292, "frac_reward_zero_std": 1.0, "grad_norm": 0.197265625, "kl": 0.031393807439599186, "learning_rate": 7.889999999999999e-06, "loss": 0.0013, "num_tokens": 12261806.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 276 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1556.0, "completions/max_terminated_length": 1556.0, "completions/mean_length": 221.375, "completions/mean_terminated_length": 221.375, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "epoch": 0.02938368515964782, "frac_reward_zero_std": 1.0, "grad_norm": 0.11767578125, "kl": 0.034186359436716884, "learning_rate": 7.8896e-06, "loss": 0.0014, "num_tokens": 12309018.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 277 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 127.0, "completions/max_terminated_length": 127.0, "completions/mean_length": 104.46875, "completions/mean_terminated_length": 104.46875, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "epoch": 0.02948976344542272, "frac_reward_zero_std": 1.0, "grad_norm": 0.1640625, "kl": 0.042672890005633235, "learning_rate": 7.889199999999999e-06, "loss": 0.0017, "num_tokens": 12343017.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 278 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 147.0, "completions/max_terminated_length": 147.0, "completions/mean_length": 128.78125, "completions/mean_terminated_length": 128.78125, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "epoch": 0.029595841731197624, "frac_reward_zero_std": 1.0, "grad_norm": 0.1298828125, "kl": 0.06968200847040862, "learning_rate": 7.8888e-06, "loss": 0.0028, "num_tokens": 12375650.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 279 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1057.0, "completions/max_terminated_length": 1057.0, "completions/mean_length": 320.46875, "completions/mean_terminated_length": 320.46875, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "epoch": 0.029701920016972527, "frac_reward_zero_std": 0.0, "grad_norm": 1.1875, "kl": 0.06288569653406739, "learning_rate": 7.888399999999999e-06, "loss": 0.0025, "num_tokens": 12437777.0, "reward": 3.931251049041748, "reward_std": 0.27083924412727356, "rewards/reward_fn/mean": 3.931251049041748, "rewards/reward_fn/std": 0.27083921432495117, "step": 280 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 157.0, "completions/max_terminated_length": 157.0, "completions/mean_length": 115.71875, "completions/mean_terminated_length": 115.71875, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "epoch": 0.029807998302747427, "frac_reward_zero_std": 1.0, "grad_norm": 0.1455078125, "kl": 0.06481995538342744, "learning_rate": 7.888e-06, "loss": 0.0026, "num_tokens": 12473992.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 281 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 215.0, "completions/max_terminated_length": 215.0, "completions/mean_length": 166.78125, "completions/mean_terminated_length": 166.78125, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "epoch": 0.02991407658852233, "frac_reward_zero_std": 1.0, "grad_norm": 0.109375, "kl": 0.05803050857502967, "learning_rate": 7.887599999999999e-06, "loss": 0.0023, "num_tokens": 12496545.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 282 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1879.0, "completions/max_terminated_length": 1879.0, "completions/mean_length": 531.6875, "completions/mean_terminated_length": 531.6875, "completions/min_length": 227.0, "completions/min_terminated_length": 227.0, "epoch": 0.030020154874297232, "frac_reward_zero_std": 0.0, "grad_norm": 1.1875, "kl": 0.06650032801553607, "learning_rate": 7.8872e-06, "loss": 0.0027, "num_tokens": 12548439.0, "reward": 3.1316885948181152, "reward_std": 0.381794273853302, "rewards/reward_fn/mean": 3.1316885948181152, "rewards/reward_fn/std": 0.3817942142486572, "step": 283 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 381.0, "completions/max_terminated_length": 381.0, "completions/mean_length": 257.53125, "completions/mean_terminated_length": 257.53125, "completions/min_length": 179.0, "completions/min_terminated_length": 179.0, "epoch": 0.030126233160072135, "frac_reward_zero_std": 0.0, "grad_norm": 1.9453125, "kl": 0.05967911321204156, "learning_rate": 7.8868e-06, "loss": 0.0024, "num_tokens": 12591048.0, "reward": 2.9689855575561523, "reward_std": 0.0832633227109909, "rewards/reward_fn/mean": 2.9689855575561523, "rewards/reward_fn/std": 0.0832633450627327, "step": 284 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 106.0, "completions/max_terminated_length": 106.0, "completions/mean_length": 83.1875, "completions/mean_terminated_length": 83.1875, "completions/min_length": 72.0, "completions/min_terminated_length": 72.0, "epoch": 0.030232311445847034, "frac_reward_zero_std": 0.0, "grad_norm": 3.828125, "kl": 0.044598213862627745, "learning_rate": 7.8864e-06, "loss": 0.0018, "num_tokens": 12621134.0, "reward": 3.341036319732666, "reward_std": 0.005204085260629654, "rewards/reward_fn/mean": 3.341036319732666, "rewards/reward_fn/std": 0.0052041225135326385, "step": 285 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 233.0, "completions/max_terminated_length": 233.0, "completions/mean_length": 185.03125, "completions/mean_terminated_length": 185.03125, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "epoch": 0.030338389731621937, "frac_reward_zero_std": 0.0, "grad_norm": 2.21875, "kl": 0.05357580480631441, "learning_rate": 7.886e-06, "loss": 0.0021, "num_tokens": 12655343.0, "reward": 3.5632970333099365, "reward_std": 0.5730718970298767, "rewards/reward_fn/mean": 3.5632970333099365, "rewards/reward_fn/std": 0.5730718970298767, "step": 286 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 416.0, "completions/max_terminated_length": 416.0, "completions/mean_length": 281.59375, "completions/mean_terminated_length": 281.59375, "completions/min_length": 194.0, "completions/min_terminated_length": 194.0, "epoch": 0.03044446801739684, "frac_reward_zero_std": 0.0, "grad_norm": 1.453125, "kl": 0.05916093214182183, "learning_rate": 7.8856e-06, "loss": 0.0024, "num_tokens": 12701762.0, "reward": 2.9577698707580566, "reward_std": 0.07138428837060928, "rewards/reward_fn/mean": 2.9577698707580566, "rewards/reward_fn/std": 0.07138428092002869, "step": 287 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 684.0, "completions/max_terminated_length": 684.0, "completions/mean_length": 355.40625, "completions/mean_terminated_length": 355.40625, "completions/min_length": 186.0, "completions/min_terminated_length": 186.0, "epoch": 0.03055054630317174, "frac_reward_zero_std": 1.0, "grad_norm": 0.07666015625, "kl": 0.0407480897847563, "learning_rate": 7.8852e-06, "loss": 0.0016, "num_tokens": 12757071.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 288 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1118.0, "completions/max_terminated_length": 1118.0, "completions/mean_length": 425.28125, "completions/mean_terminated_length": 425.28125, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "epoch": 0.030656624588946642, "frac_reward_zero_std": 1.0, "grad_norm": 0.07275390625, "kl": 0.03841312974691391, "learning_rate": 7.8848e-06, "loss": 0.0015, "num_tokens": 12805368.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 289 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 679.0, "completions/max_terminated_length": 679.0, "completions/mean_length": 426.6875, "completions/mean_terminated_length": 426.6875, "completions/min_length": 285.0, "completions/min_terminated_length": 285.0, "epoch": 0.030762702874721545, "frac_reward_zero_std": 0.0, "grad_norm": 1.0, "kl": 0.06251498265191913, "learning_rate": 7.8844e-06, "loss": 0.0025, "num_tokens": 12865678.0, "reward": 2.680443525314331, "reward_std": 0.1847320795059204, "rewards/reward_fn/mean": 2.680443525314331, "rewards/reward_fn/std": 0.18473204970359802, "step": 290 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 489.0, "completions/max_terminated_length": 489.0, "completions/mean_length": 319.84375, "completions/mean_terminated_length": 319.84375, "completions/min_length": 230.0, "completions/min_terminated_length": 230.0, "epoch": 0.030868781160496447, "frac_reward_zero_std": 0.0, "grad_norm": 1.359375, "kl": 0.039175089448690414, "learning_rate": 7.884e-06, "loss": 0.0016, "num_tokens": 12890473.0, "reward": 3.614109992980957, "reward_std": 0.6846138834953308, "rewards/reward_fn/mean": 3.614109992980957, "rewards/reward_fn/std": 0.6846139430999756, "step": 291 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1154.0, "completions/max_terminated_length": 1154.0, "completions/mean_length": 346.28125, "completions/mean_terminated_length": 346.28125, "completions/min_length": 187.0, "completions/min_terminated_length": 187.0, "epoch": 0.030974859446271347, "frac_reward_zero_std": 0.0, "grad_norm": 1.640625, "kl": 0.05797181086381897, "learning_rate": 7.8836e-06, "loss": 0.0023, "num_tokens": 12932338.0, "reward": 3.0171799659729004, "reward_std": 0.024844994768500328, "rewards/reward_fn/mean": 3.0171799659729004, "rewards/reward_fn/std": 0.024844978004693985, "step": 292 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 469.0, "completions/max_terminated_length": 469.0, "completions/mean_length": 307.1875, "completions/mean_terminated_length": 307.1875, "completions/min_length": 224.0, "completions/min_terminated_length": 224.0, "epoch": 0.03108093773204625, "frac_reward_zero_std": 1.0, "grad_norm": 0.0849609375, "kl": 0.05411390570225194, "learning_rate": 7.8832e-06, "loss": 0.0022, "num_tokens": 12981112.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 293 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 204.0, "completions/max_terminated_length": 204.0, "completions/mean_length": 172.84375, "completions/mean_terminated_length": 172.84375, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "epoch": 0.031187016017821152, "frac_reward_zero_std": 1.0, "grad_norm": 0.1005859375, "kl": 0.04656997602432966, "learning_rate": 7.882799999999998e-06, "loss": 0.0019, "num_tokens": 13030483.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 294 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 241.0, "completions/max_terminated_length": 241.0, "completions/mean_length": 143.28125, "completions/mean_terminated_length": 143.28125, "completions/min_length": 76.0, "completions/min_terminated_length": 76.0, "epoch": 0.03129309430359605, "frac_reward_zero_std": 0.0, "grad_norm": 2.0625, "kl": 0.09696014143992215, "learning_rate": 7.8824e-06, "loss": 0.0039, "num_tokens": 13072348.0, "reward": 3.930624008178711, "reward_std": 0.27306249737739563, "rewards/reward_fn/mean": 3.930624008178711, "rewards/reward_fn/std": 0.273062527179718, "step": 295 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 95.0, "completions/max_terminated_length": 95.0, "completions/mean_length": 91.84375, "completions/mean_terminated_length": 91.84375, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "epoch": 0.03139917258937096, "frac_reward_zero_std": 1.0, "grad_norm": 0.1904296875, "kl": 0.05550631647929549, "learning_rate": 7.882e-06, "loss": 0.0022, "num_tokens": 13092599.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 296 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 234.0, "completions/max_terminated_length": 234.0, "completions/mean_length": 190.75, "completions/mean_terminated_length": 190.75, "completions/min_length": 175.0, "completions/min_terminated_length": 175.0, "epoch": 0.03150525087514586, "frac_reward_zero_std": 1.0, "grad_norm": 0.134765625, "kl": 0.06322728469967842, "learning_rate": 7.8816e-06, "loss": 0.0025, "num_tokens": 13138255.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 297 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 278.0, "completions/max_terminated_length": 278.0, "completions/mean_length": 209.21875, "completions/mean_terminated_length": 209.21875, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "epoch": 0.031611329160920756, "frac_reward_zero_std": 1.0, "grad_norm": 0.10009765625, "kl": 0.04859055450651795, "learning_rate": 7.8812e-06, "loss": 0.0019, "num_tokens": 13170902.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 298 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1606.0, "completions/mean_length": 941.46875, "completions/mean_terminated_length": 905.774169921875, "completions/min_length": 387.0, "completions/min_terminated_length": 387.0, "epoch": 0.03171740744669566, "frac_reward_zero_std": 0.0, "grad_norm": 0.94921875, "kl": 0.02767645107815042, "learning_rate": 7.880799999999999e-06, "loss": 0.0011, "num_tokens": 13251205.0, "reward": 2.9043970108032227, "reward_std": 0.6032812595367432, "rewards/reward_fn/mean": 2.9043970108032227, "rewards/reward_fn/std": 0.6032813191413879, "step": 299 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 275.0, "completions/max_terminated_length": 275.0, "completions/mean_length": 227.0625, "completions/mean_terminated_length": 227.0625, "completions/min_length": 188.0, "completions/min_terminated_length": 188.0, "epoch": 0.03182348573247056, "frac_reward_zero_std": 0.0, "grad_norm": 1.9375, "kl": 0.08030975563451648, "learning_rate": 7.8804e-06, "loss": 0.0032, "num_tokens": 13288967.0, "reward": 3.018399715423584, "reward_std": 0.02766135148704052, "rewards/reward_fn/mean": 3.018399715423584, "rewards/reward_fn/std": 0.027661342173814774, "step": 300 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 966.0, "completions/max_terminated_length": 966.0, "completions/mean_length": 628.0, "completions/mean_terminated_length": 628.0, "completions/min_length": 398.0, "completions/min_terminated_length": 398.0, "epoch": 0.03192956401824547, "frac_reward_zero_std": 0.0, "grad_norm": 1.0078125, "kl": 0.04724831320345402, "learning_rate": 7.879999999999999e-06, "loss": 0.0019, "num_tokens": 13358855.0, "reward": 2.7298004627227783, "reward_std": 0.19260305166244507, "rewards/reward_fn/mean": 2.7298004627227783, "rewards/reward_fn/std": 0.19260311126708984, "step": 301 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 224.25, "completions/mean_terminated_length": 224.25, "completions/min_length": 185.0, "completions/min_terminated_length": 185.0, "epoch": 0.03203564230402037, "frac_reward_zero_std": 1.0, "grad_norm": 0.11572265625, "kl": 0.07541715656407177, "learning_rate": 7.8796e-06, "loss": 0.003, "num_tokens": 13418479.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 302 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 258.0, "completions/max_terminated_length": 258.0, "completions/mean_length": 202.59375, "completions/mean_terminated_length": 202.59375, "completions/min_length": 179.0, "completions/min_terminated_length": 179.0, "epoch": 0.03214172058979527, "frac_reward_zero_std": 0.0, "grad_norm": 1.453125, "kl": 0.06993928935844451, "learning_rate": 7.879199999999999e-06, "loss": 0.0028, "num_tokens": 13465890.0, "reward": 3.0506973266601562, "reward_std": 0.8426318764686584, "rewards/reward_fn/mean": 3.0506973266601562, "rewards/reward_fn/std": 0.8426318764686584, "step": 303 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 343.0, "completions/max_terminated_length": 343.0, "completions/mean_length": 225.71875, "completions/mean_terminated_length": 225.71875, "completions/min_length": 196.0, "completions/min_terminated_length": 196.0, "epoch": 0.03224779887557017, "frac_reward_zero_std": 1.0, "grad_norm": 0.08251953125, "kl": 0.06964913755655289, "learning_rate": 7.8788e-06, "loss": 0.0028, "num_tokens": 13512537.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 304 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 425.0, "completions/max_terminated_length": 425.0, "completions/mean_length": 281.84375, "completions/mean_terminated_length": 281.84375, "completions/min_length": 175.0, "completions/min_terminated_length": 175.0, "epoch": 0.03235387716134507, "frac_reward_zero_std": 0.0, "grad_norm": 1.5859375, "kl": 0.10228216869290918, "learning_rate": 7.878399999999999e-06, "loss": 0.0041, "num_tokens": 13559316.0, "reward": 3.088973045349121, "reward_std": 0.3510621190071106, "rewards/reward_fn/mean": 3.088973045349121, "rewards/reward_fn/std": 0.3510621190071106, "step": 305 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 130.0, "completions/max_terminated_length": 130.0, "completions/mean_length": 93.90625, "completions/mean_terminated_length": 93.90625, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "epoch": 0.03245995544711997, "frac_reward_zero_std": 1.0, "grad_norm": 0.1689453125, "kl": 0.03925357403932139, "learning_rate": 7.878e-06, "loss": 0.0016, "num_tokens": 13574705.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 306 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 223.0, "completions/max_terminated_length": 223.0, "completions/mean_length": 164.40625, "completions/mean_terminated_length": 164.40625, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "epoch": 0.03256603373289488, "frac_reward_zero_std": 1.0, "grad_norm": 0.1513671875, "kl": 0.09308067942038178, "learning_rate": 7.8776e-06, "loss": 0.0037, "num_tokens": 13617726.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 307 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 273.0, "completions/max_terminated_length": 273.0, "completions/mean_length": 242.59375, "completions/mean_terminated_length": 242.59375, "completions/min_length": 213.0, "completions/min_terminated_length": 213.0, "epoch": 0.03267211201866978, "frac_reward_zero_std": 0.0, "grad_norm": 1.34375, "kl": 0.04490488808369264, "learning_rate": 7.8772e-06, "loss": 0.0018, "num_tokens": 13661489.0, "reward": 2.939380645751953, "reward_std": 0.02233259007334709, "rewards/reward_fn/mean": 2.939380645751953, "rewards/reward_fn/std": 0.02233261801302433, "step": 308 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 235.0, "completions/max_terminated_length": 235.0, "completions/mean_length": 168.0625, "completions/mean_terminated_length": 168.0625, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "epoch": 0.03277819030444468, "frac_reward_zero_std": 1.0, "grad_norm": 0.146484375, "kl": 0.07158257730770856, "learning_rate": 7.8768e-06, "loss": 0.0029, "num_tokens": 13700787.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 309 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 266.0, "completions/max_terminated_length": 266.0, "completions/mean_length": 201.03125, "completions/mean_terminated_length": 201.03125, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "epoch": 0.03288426859021958, "frac_reward_zero_std": 1.0, "grad_norm": 0.1279296875, "kl": 0.06688284839037806, "learning_rate": 7.8764e-06, "loss": 0.0027, "num_tokens": 13742836.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 310 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 463.0, "completions/max_terminated_length": 463.0, "completions/mean_length": 259.875, "completions/mean_terminated_length": 259.875, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "epoch": 0.03299034687599448, "frac_reward_zero_std": 0.0, "grad_norm": 1.625, "kl": 0.04511914274189621, "learning_rate": 7.876e-06, "loss": 0.0018, "num_tokens": 13783056.0, "reward": 3.962053060531616, "reward_std": 0.21466024219989777, "rewards/reward_fn/mean": 3.962053060531616, "rewards/reward_fn/std": 0.21466021239757538, "step": 311 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 101.0, "completions/max_terminated_length": 101.0, "completions/mean_length": 97.15625, "completions/mean_terminated_length": 97.15625, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "epoch": 0.03309642516176939, "frac_reward_zero_std": 1.0, "grad_norm": 0.15625, "kl": 0.04876602778676897, "learning_rate": 7.8756e-06, "loss": 0.002, "num_tokens": 13826933.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 312 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 394.0, "completions/max_terminated_length": 394.0, "completions/mean_length": 251.625, "completions/mean_terminated_length": 251.625, "completions/min_length": 202.0, "completions/min_terminated_length": 202.0, "epoch": 0.03320250344754429, "frac_reward_zero_std": 0.0, "grad_norm": 2.140625, "kl": 0.08308691228739917, "learning_rate": 7.8752e-06, "loss": 0.0033, "num_tokens": 13895529.0, "reward": 3.067251682281494, "reward_std": 0.03814023733139038, "rewards/reward_fn/mean": 3.067251682281494, "rewards/reward_fn/std": 0.03814024478197098, "step": 313 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 839.0, "completions/max_terminated_length": 839.0, "completions/mean_length": 228.34375, "completions/mean_terminated_length": 228.34375, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "epoch": 0.03330858173331919, "frac_reward_zero_std": 0.0, "grad_norm": 1.0546875, "kl": 0.10598539758939296, "learning_rate": 7.8748e-06, "loss": 0.0042, "num_tokens": 13941396.0, "reward": 3.958070755004883, "reward_std": 0.2371879369020462, "rewards/reward_fn/mean": 3.958070755004883, "rewards/reward_fn/std": 0.237187922000885, "step": 314 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 718.0, "completions/max_terminated_length": 718.0, "completions/mean_length": 335.8125, "completions/mean_terminated_length": 335.8125, "completions/min_length": 251.0, "completions/min_terminated_length": 251.0, "epoch": 0.03341466001909409, "frac_reward_zero_std": 0.0, "grad_norm": 1.3671875, "kl": 0.04709806991741061, "learning_rate": 7.874399999999999e-06, "loss": 0.0019, "num_tokens": 13964750.0, "reward": 3.926464080810547, "reward_std": 0.41598135232925415, "rewards/reward_fn/mean": 3.926464080810547, "rewards/reward_fn/std": 0.41598138213157654, "step": 315 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1325.0, "completions/mean_length": 684.34375, "completions/mean_terminated_length": 489.5357360839844, "completions/min_length": 79.0, "completions/min_terminated_length": 79.0, "epoch": 0.03352073830486899, "frac_reward_zero_std": 0.0, "grad_norm": 1.5703125, "kl": 0.04148575963336043, "learning_rate": 7.874e-06, "loss": 0.0017, "num_tokens": 14021081.0, "reward": 2.240612030029297, "reward_std": 1.1568071842193604, "rewards/reward_fn/mean": 2.240612030029297, "rewards/reward_fn/std": 1.1568071842193604, "step": 316 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 536.0, "completions/max_terminated_length": 536.0, "completions/mean_length": 333.78125, "completions/mean_terminated_length": 333.78125, "completions/min_length": 244.0, "completions/min_terminated_length": 244.0, "epoch": 0.03362681659064389, "frac_reward_zero_std": 0.0, "grad_norm": 1.21875, "kl": 0.05524576886091381, "learning_rate": 7.873599999999999e-06, "loss": 0.0022, "num_tokens": 14066962.0, "reward": 3.1448464393615723, "reward_std": 0.28363174200057983, "rewards/reward_fn/mean": 3.1448464393615723, "rewards/reward_fn/std": 0.2836317718029022, "step": 317 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1069.0, "completions/max_terminated_length": 1069.0, "completions/mean_length": 353.1875, "completions/mean_terminated_length": 353.1875, "completions/min_length": 256.0, "completions/min_terminated_length": 256.0, "epoch": 0.0337328948764188, "frac_reward_zero_std": 0.0, "grad_norm": 1.2265625, "kl": 0.06478465755935758, "learning_rate": 7.8732e-06, "loss": 0.0026, "num_tokens": 14109848.0, "reward": 2.9029226303100586, "reward_std": 0.061471011489629745, "rewards/reward_fn/mean": 2.9029226303100586, "rewards/reward_fn/std": 0.06147094815969467, "step": 318 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 488.0, "completions/max_terminated_length": 488.0, "completions/mean_length": 268.125, "completions/mean_terminated_length": 268.125, "completions/min_length": 183.0, "completions/min_terminated_length": 183.0, "epoch": 0.0338389731621937, "frac_reward_zero_std": 0.0, "grad_norm": 1.5859375, "kl": 0.07620589598082006, "learning_rate": 7.8728e-06, "loss": 0.003, "num_tokens": 14151164.0, "reward": 2.853126049041748, "reward_std": 0.06743326038122177, "rewards/reward_fn/mean": 2.853126049041748, "rewards/reward_fn/std": 0.06743327528238297, "step": 319 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 321.0, "completions/max_terminated_length": 321.0, "completions/mean_length": 236.96875, "completions/mean_terminated_length": 236.96875, "completions/min_length": 184.0, "completions/min_terminated_length": 184.0, "epoch": 0.033945051447968604, "frac_reward_zero_std": 0.0, "grad_norm": 1.765625, "kl": 0.08050445187836885, "learning_rate": 7.8724e-06, "loss": 0.0032, "num_tokens": 14195995.0, "reward": 2.7778611183166504, "reward_std": 0.030127666890621185, "rewards/reward_fn/mean": 2.7778611183166504, "rewards/reward_fn/std": 0.030127670615911484, "step": 320 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 304.0, "completions/max_terminated_length": 304.0, "completions/mean_length": 194.78125, "completions/mean_terminated_length": 194.78125, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "epoch": 0.0340511297337435, "frac_reward_zero_std": 0.0, "grad_norm": 2.1875, "kl": 0.08380766562186182, "learning_rate": 7.872e-06, "loss": 0.0034, "num_tokens": 14251092.0, "reward": 3.6946752071380615, "reward_std": 0.21935689449310303, "rewards/reward_fn/mean": 3.6946752071380615, "rewards/reward_fn/std": 0.21935686469078064, "step": 321 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 419.0, "completions/max_terminated_length": 419.0, "completions/mean_length": 278.5, "completions/mean_terminated_length": 278.5, "completions/min_length": 216.0, "completions/min_terminated_length": 216.0, "epoch": 0.0341572080195184, "frac_reward_zero_std": 1.0, "grad_norm": 0.10205078125, "kl": 0.09025908191688359, "learning_rate": 7.8716e-06, "loss": 0.0036, "num_tokens": 14304740.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 322 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 149.0, "completions/max_terminated_length": 149.0, "completions/mean_length": 117.21875, "completions/mean_terminated_length": 117.21875, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "epoch": 0.03426328630529331, "frac_reward_zero_std": 1.0, "grad_norm": 0.1689453125, "kl": 0.07025603024521843, "learning_rate": 7.8712e-06, "loss": 0.0028, "num_tokens": 14329771.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 323 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 317.0, "completions/max_terminated_length": 317.0, "completions/mean_length": 252.6875, "completions/mean_terminated_length": 252.6875, "completions/min_length": 205.0, "completions/min_terminated_length": 205.0, "epoch": 0.03436936459106821, "frac_reward_zero_std": 0.0, "grad_norm": 1.53125, "kl": 0.079339619143866, "learning_rate": 7.8708e-06, "loss": 0.0032, "num_tokens": 14379169.0, "reward": 3.0271012783050537, "reward_std": 0.025822646915912628, "rewards/reward_fn/mean": 3.0271012783050537, "rewards/reward_fn/std": 0.025822622701525688, "step": 324 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 338.0, "completions/max_terminated_length": 338.0, "completions/mean_length": 248.125, "completions/mean_terminated_length": 248.125, "completions/min_length": 187.0, "completions/min_terminated_length": 187.0, "epoch": 0.03447544287684311, "frac_reward_zero_std": 1.0, "grad_norm": 0.12060546875, "kl": 0.11712923878803849, "learning_rate": 7.8704e-06, "loss": 0.0047, "num_tokens": 14423781.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 325 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 265.0, "completions/max_terminated_length": 265.0, "completions/mean_length": 229.0625, "completions/mean_terminated_length": 229.0625, "completions/min_length": 198.0, "completions/min_terminated_length": 198.0, "epoch": 0.03458152116261801, "frac_reward_zero_std": 1.0, "grad_norm": 0.0947265625, "kl": 0.06614450388588011, "learning_rate": 7.87e-06, "loss": 0.0026, "num_tokens": 14464295.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 326 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 200.0, "completions/max_terminated_length": 200.0, "completions/mean_length": 164.0, "completions/mean_terminated_length": 164.0, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "epoch": 0.03468759944839291, "frac_reward_zero_std": 0.0, "grad_norm": 1.734375, "kl": 0.06275707174791023, "learning_rate": 7.8696e-06, "loss": 0.0025, "num_tokens": 14514919.0, "reward": 3.0044941902160645, "reward_std": 0.014064479619264603, "rewards/reward_fn/mean": 3.0044941902160645, "rewards/reward_fn/std": 0.014064503833651543, "step": 327 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 116.0, "completions/max_terminated_length": 116.0, "completions/mean_length": 87.1875, "completions/mean_terminated_length": 87.1875, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "epoch": 0.03479367773416782, "frac_reward_zero_std": 1.0, "grad_norm": 0.171875, "kl": 0.030889727699104697, "learning_rate": 7.869199999999999e-06, "loss": 0.0012, "num_tokens": 14552333.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 328 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 734.0, "completions/max_terminated_length": 734.0, "completions/mean_length": 352.6875, "completions/mean_terminated_length": 352.6875, "completions/min_length": 235.0, "completions/min_terminated_length": 235.0, "epoch": 0.03489975601994272, "frac_reward_zero_std": 0.0, "grad_norm": 1.390625, "kl": 0.06849374365992844, "learning_rate": 7.8688e-06, "loss": 0.0027, "num_tokens": 14593507.0, "reward": 2.9629576206207275, "reward_std": 0.034861400723457336, "rewards/reward_fn/mean": 2.9629576206207275, "rewards/reward_fn/std": 0.03486141189932823, "step": 329 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 279.0, "completions/max_terminated_length": 279.0, "completions/mean_length": 195.09375, "completions/mean_terminated_length": 195.09375, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "epoch": 0.03500583430571762, "frac_reward_zero_std": 0.0, "grad_norm": 2.296875, "kl": 0.09164670249447227, "learning_rate": 7.8684e-06, "loss": 0.0037, "num_tokens": 14638374.0, "reward": 3.929513931274414, "reward_std": 0.3987296223640442, "rewards/reward_fn/mean": 3.929513931274414, "rewards/reward_fn/std": 0.3987296521663666, "step": 330 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 180.0, "completions/max_terminated_length": 180.0, "completions/mean_length": 150.8125, "completions/mean_terminated_length": 150.8125, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "epoch": 0.035111912591492524, "frac_reward_zero_std": 1.0, "grad_norm": 0.09521484375, "kl": 0.08404018310829997, "learning_rate": 7.868e-06, "loss": 0.0034, "num_tokens": 14675680.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 331 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 284.0, "completions/max_terminated_length": 284.0, "completions/mean_length": 194.46875, "completions/mean_terminated_length": 194.46875, "completions/min_length": 174.0, "completions/min_terminated_length": 174.0, "epoch": 0.03521799087726742, "frac_reward_zero_std": 0.0, "grad_norm": 1.84375, "kl": 0.09944120491854846, "learning_rate": 7.8676e-06, "loss": 0.004, "num_tokens": 14715183.0, "reward": 2.8881282806396484, "reward_std": 0.014438438229262829, "rewards/reward_fn/mean": 2.8881282806396484, "rewards/reward_fn/std": 0.014438426122069359, "step": 332 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 712.0, "completions/max_terminated_length": 712.0, "completions/mean_length": 578.6875, "completions/mean_terminated_length": 578.6875, "completions/min_length": 445.0, "completions/min_terminated_length": 445.0, "epoch": 0.03532406916304232, "frac_reward_zero_std": 0.0, "grad_norm": 1.0625, "kl": 0.04957582626957446, "learning_rate": 7.8672e-06, "loss": 0.002, "num_tokens": 14777285.0, "reward": 3.68550968170166, "reward_std": 0.7617502808570862, "rewards/reward_fn/mean": 3.68550968170166, "rewards/reward_fn/std": 0.7617502212524414, "step": 333 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 359.0, "completions/max_terminated_length": 359.0, "completions/mean_length": 276.0625, "completions/mean_terminated_length": 276.0625, "completions/min_length": 226.0, "completions/min_terminated_length": 226.0, "epoch": 0.03543014744881723, "frac_reward_zero_std": 1.0, "grad_norm": 0.07080078125, "kl": 0.051250798685941845, "learning_rate": 7.866799999999999e-06, "loss": 0.0021, "num_tokens": 14824295.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 334 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 250.0, "completions/max_terminated_length": 250.0, "completions/mean_length": 196.0, "completions/mean_terminated_length": 196.0, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "epoch": 0.03553622573459213, "frac_reward_zero_std": 0.0, "grad_norm": 1.578125, "kl": 0.07835527614224702, "learning_rate": 7.8664e-06, "loss": 0.0031, "num_tokens": 14873447.0, "reward": 3.213829517364502, "reward_std": 0.2138700932264328, "rewards/reward_fn/mean": 3.213829517364502, "rewards/reward_fn/std": 0.2138700783252716, "step": 335 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 875.0, "completions/max_terminated_length": 875.0, "completions/mean_length": 470.9375, "completions/mean_terminated_length": 470.9375, "completions/min_length": 317.0, "completions/min_terminated_length": 317.0, "epoch": 0.035642304020367034, "frac_reward_zero_std": 0.0, "grad_norm": 1.2578125, "kl": 0.07522962475195527, "learning_rate": 7.865999999999999e-06, "loss": 0.003, "num_tokens": 14934085.0, "reward": 2.775710105895996, "reward_std": 0.02678767405450344, "rewards/reward_fn/mean": 2.775710105895996, "rewards/reward_fn/std": 0.02678770385682583, "step": 336 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 260.0, "completions/max_terminated_length": 260.0, "completions/mean_length": 217.28125, "completions/mean_terminated_length": 217.28125, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "epoch": 0.03574838230614193, "frac_reward_zero_std": 0.0, "grad_norm": 1.984375, "kl": 0.08199672133196145, "learning_rate": 7.8656e-06, "loss": 0.0033, "num_tokens": 14970926.0, "reward": 2.9818575382232666, "reward_std": 0.03788159415125847, "rewards/reward_fn/mean": 2.9818575382232666, "rewards/reward_fn/std": 0.037881579250097275, "step": 337 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 286.0, "completions/max_terminated_length": 286.0, "completions/mean_length": 127.6875, "completions/mean_terminated_length": 127.6875, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "epoch": 0.03585446059191683, "frac_reward_zero_std": 1.0, "grad_norm": 0.1796875, "kl": 0.10428278136532754, "learning_rate": 7.865199999999999e-06, "loss": 0.0042, "num_tokens": 15020196.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 338 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 178.0, "completions/max_terminated_length": 178.0, "completions/mean_length": 163.125, "completions/mean_terminated_length": 163.125, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "epoch": 0.03596053887769174, "frac_reward_zero_std": 1.0, "grad_norm": 0.1064453125, "kl": 0.1081386434379965, "learning_rate": 7.8648e-06, "loss": 0.0043, "num_tokens": 15065544.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 339 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 332.0, "completions/max_terminated_length": 332.0, "completions/mean_length": 288.25, "completions/mean_terminated_length": 288.25, "completions/min_length": 237.0, "completions/min_terminated_length": 237.0, "epoch": 0.03606661716346664, "frac_reward_zero_std": 0.0, "grad_norm": 1.015625, "kl": 0.059167297556996346, "learning_rate": 7.864399999999999e-06, "loss": 0.0024, "num_tokens": 15111632.0, "reward": 2.826021194458008, "reward_std": 0.021319499239325523, "rewards/reward_fn/mean": 2.826021194458008, "rewards/reward_fn/std": 0.021319523453712463, "step": 340 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 290.0, "completions/max_terminated_length": 290.0, "completions/mean_length": 210.375, "completions/mean_terminated_length": 210.375, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "epoch": 0.03617269544924154, "frac_reward_zero_std": 1.0, "grad_norm": 0.18359375, "kl": 0.09864125947933644, "learning_rate": 7.864e-06, "loss": 0.0039, "num_tokens": 15136988.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 341 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 251.0, "completions/max_terminated_length": 251.0, "completions/mean_length": 190.5, "completions/mean_terminated_length": 190.5, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "epoch": 0.036278773735016444, "frac_reward_zero_std": 0.0, "grad_norm": 2.0625, "kl": 0.07318210729863495, "learning_rate": 7.8636e-06, "loss": 0.0029, "num_tokens": 15181260.0, "reward": 3.665925979614258, "reward_std": 0.5428962707519531, "rewards/reward_fn/mean": 3.665925979614258, "rewards/reward_fn/std": 0.5428962707519531, "step": 342 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1174.0, "completions/mean_length": 613.65625, "completions/mean_terminated_length": 567.3870849609375, "completions/min_length": 362.0, "completions/min_terminated_length": 362.0, "epoch": 0.03638485202079134, "frac_reward_zero_std": 0.0, "grad_norm": 0.6328125, "kl": 0.04777409916277975, "learning_rate": 7.8632e-06, "loss": 0.0019, "num_tokens": 15241761.0, "reward": 2.752007484436035, "reward_std": 0.5035932064056396, "rewards/reward_fn/mean": 2.752007484436035, "rewards/reward_fn/std": 0.5035931468009949, "step": 343 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 220.0, "completions/max_terminated_length": 220.0, "completions/mean_length": 160.34375, "completions/mean_terminated_length": 160.34375, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "epoch": 0.03649093030656624, "frac_reward_zero_std": 0.0, "grad_norm": 2.765625, "kl": 0.07249265850987285, "learning_rate": 7.8628e-06, "loss": 0.0029, "num_tokens": 15281484.0, "reward": 3.5909247398376465, "reward_std": 0.5742731690406799, "rewards/reward_fn/mean": 3.5909247398376465, "rewards/reward_fn/std": 0.5742731690406799, "step": 344 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 414.0, "completions/max_terminated_length": 414.0, "completions/mean_length": 260.0, "completions/mean_terminated_length": 260.0, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "epoch": 0.03659700859234115, "frac_reward_zero_std": 0.0, "grad_norm": 1.8671875, "kl": 0.05745165317784995, "learning_rate": 7.8624e-06, "loss": 0.0023, "num_tokens": 15301420.0, "reward": 3.959191083908081, "reward_std": 0.2308497428894043, "rewards/reward_fn/mean": 3.959191083908081, "rewards/reward_fn/std": 0.23084969818592072, "step": 345 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 732.0, "completions/mean_length": 462.4375, "completions/mean_terminated_length": 411.2903137207031, "completions/min_length": 236.0, "completions/min_terminated_length": 236.0, "epoch": 0.03670308687811605, "frac_reward_zero_std": 0.0, "grad_norm": 0.85546875, "kl": 0.05426511203404516, "learning_rate": 7.862e-06, "loss": 0.0022, "num_tokens": 15368378.0, "reward": 2.8959522247314453, "reward_std": 0.5591188669204712, "rewards/reward_fn/mean": 2.8959522247314453, "rewards/reward_fn/std": 0.5591188669204712, "step": 346 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 183.0, "completions/max_terminated_length": 183.0, "completions/mean_length": 143.9375, "completions/mean_terminated_length": 143.9375, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "epoch": 0.036809165163890954, "frac_reward_zero_std": 1.0, "grad_norm": 0.14453125, "kl": 0.07412998762447387, "learning_rate": 7.8616e-06, "loss": 0.003, "num_tokens": 15406200.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 347 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 209.0, "completions/max_terminated_length": 209.0, "completions/mean_length": 171.6875, "completions/mean_terminated_length": 171.6875, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "epoch": 0.036915243449665854, "frac_reward_zero_std": 1.0, "grad_norm": 0.09912109375, "kl": 0.06604230729863048, "learning_rate": 7.8612e-06, "loss": 0.0026, "num_tokens": 15444398.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 348 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 271.0, "completions/max_terminated_length": 271.0, "completions/mean_length": 192.375, "completions/mean_terminated_length": 192.375, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "epoch": 0.03702132173544075, "frac_reward_zero_std": 0.0, "grad_norm": 1.7421875, "kl": 0.08114727039355785, "learning_rate": 7.8608e-06, "loss": 0.0032, "num_tokens": 15485882.0, "reward": 3.92812180519104, "reward_std": 0.2829616963863373, "rewards/reward_fn/mean": 3.92812180519104, "rewards/reward_fn/std": 0.28296172618865967, "step": 349 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 854.0, "completions/max_terminated_length": 854.0, "completions/mean_length": 428.0, "completions/mean_terminated_length": 428.0, "completions/min_length": 262.0, "completions/min_terminated_length": 262.0, "epoch": 0.03712740002121566, "frac_reward_zero_std": 0.0, "grad_norm": 1.390625, "kl": 0.05857816792558879, "learning_rate": 7.8604e-06, "loss": 0.0023, "num_tokens": 15530746.0, "reward": 3.9291768074035645, "reward_std": 0.400637149810791, "rewards/reward_fn/mean": 3.9291768074035645, "rewards/reward_fn/std": 0.400637149810791, "step": 350 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 290.0, "completions/max_terminated_length": 290.0, "completions/mean_length": 223.25, "completions/mean_terminated_length": 223.25, "completions/min_length": 190.0, "completions/min_terminated_length": 190.0, "epoch": 0.03723347830699056, "frac_reward_zero_std": 0.0, "grad_norm": 1.59375, "kl": 0.059653010219335556, "learning_rate": 7.86e-06, "loss": 0.0024, "num_tokens": 15584002.0, "reward": 3.0378000736236572, "reward_std": 0.08900023251771927, "rewards/reward_fn/mean": 3.0378000736236572, "rewards/reward_fn/std": 0.08900019526481628, "step": 351 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 289.0, "completions/max_terminated_length": 289.0, "completions/mean_length": 235.3125, "completions/mean_terminated_length": 235.3125, "completions/min_length": 189.0, "completions/min_terminated_length": 189.0, "epoch": 0.03733955659276546, "frac_reward_zero_std": 0.0, "grad_norm": 1.40625, "kl": 0.05214848497416824, "learning_rate": 7.8596e-06, "loss": 0.0021, "num_tokens": 15645964.0, "reward": 2.89939546585083, "reward_std": 0.025253912433981895, "rewards/reward_fn/mean": 2.89939546585083, "rewards/reward_fn/std": 0.025253916159272194, "step": 352 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 562.0, "completions/max_terminated_length": 562.0, "completions/mean_length": 337.71875, "completions/mean_terminated_length": 337.71875, "completions/min_length": 235.0, "completions/min_terminated_length": 235.0, "epoch": 0.037445634878540364, "frac_reward_zero_std": 0.0, "grad_norm": 1.4375, "kl": 0.07626998424530029, "learning_rate": 7.8592e-06, "loss": 0.0031, "num_tokens": 15678659.0, "reward": 3.342073440551758, "reward_std": 0.8421680927276611, "rewards/reward_fn/mean": 3.342073440551758, "rewards/reward_fn/std": 0.8421680927276611, "step": 353 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 628.0, "completions/max_terminated_length": 628.0, "completions/mean_length": 427.28125, "completions/mean_terminated_length": 427.28125, "completions/min_length": 285.0, "completions/min_terminated_length": 285.0, "epoch": 0.03755171316431526, "frac_reward_zero_std": 0.0, "grad_norm": 1.5625, "kl": 0.04384071903768927, "learning_rate": 7.8588e-06, "loss": 0.0018, "num_tokens": 15734284.0, "reward": 3.242161750793457, "reward_std": 0.7822414636611938, "rewards/reward_fn/mean": 3.242161750793457, "rewards/reward_fn/std": 0.7822414040565491, "step": 354 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 125.0, "completions/max_terminated_length": 125.0, "completions/mean_length": 111.21875, "completions/mean_terminated_length": 111.21875, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "epoch": 0.03765779145009017, "frac_reward_zero_std": 1.0, "grad_norm": 0.099609375, "kl": 0.02709541004151106, "learning_rate": 7.8584e-06, "loss": 0.0011, "num_tokens": 15773587.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 355 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 425.0, "completions/max_terminated_length": 425.0, "completions/mean_length": 253.03125, "completions/mean_terminated_length": 253.03125, "completions/min_length": 178.0, "completions/min_terminated_length": 178.0, "epoch": 0.03776386973586507, "frac_reward_zero_std": 0.0, "grad_norm": 2.921875, "kl": 0.05555059225298464, "learning_rate": 7.858e-06, "loss": 0.0022, "num_tokens": 15821876.0, "reward": 3.5505006313323975, "reward_std": 0.5528962016105652, "rewards/reward_fn/mean": 3.5505006313323975, "rewards/reward_fn/std": 0.5528962016105652, "step": 356 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 226.0, "completions/max_terminated_length": 226.0, "completions/mean_length": 190.5, "completions/mean_terminated_length": 190.5, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "epoch": 0.03786994802163997, "frac_reward_zero_std": 1.0, "grad_norm": 0.119140625, "kl": 0.05794315051753074, "learning_rate": 7.857599999999999e-06, "loss": 0.0023, "num_tokens": 15858660.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 357 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 203.0, "completions/max_terminated_length": 203.0, "completions/mean_length": 179.6875, "completions/mean_terminated_length": 179.6875, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "epoch": 0.037976026307414874, "frac_reward_zero_std": 1.0, "grad_norm": 0.1240234375, "kl": 0.06639315851498395, "learning_rate": 7.8572e-06, "loss": 0.0027, "num_tokens": 15908570.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 358 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1122.0, "completions/max_terminated_length": 1122.0, "completions/mean_length": 550.25, "completions/mean_terminated_length": 550.25, "completions/min_length": 311.0, "completions/min_terminated_length": 311.0, "epoch": 0.038082104593189774, "frac_reward_zero_std": 0.0, "grad_norm": 1.1796875, "kl": 0.05704544053878635, "learning_rate": 7.856799999999999e-06, "loss": 0.0023, "num_tokens": 15951650.0, "reward": 2.769361972808838, "reward_std": 0.1761067509651184, "rewards/reward_fn/mean": 2.769361972808838, "rewards/reward_fn/std": 0.1761067658662796, "step": 359 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 122.0, "completions/max_terminated_length": 122.0, "completions/mean_length": 109.46875, "completions/mean_terminated_length": 109.46875, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "epoch": 0.03818818287896467, "frac_reward_zero_std": 1.0, "grad_norm": 0.09814453125, "kl": 0.017623669904423878, "learning_rate": 7.8564e-06, "loss": 0.0007, "num_tokens": 15981841.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 360 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 448.0, "completions/max_terminated_length": 448.0, "completions/mean_length": 238.1875, "completions/mean_terminated_length": 238.1875, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "epoch": 0.03829426116473958, "frac_reward_zero_std": 0.0, "grad_norm": 2.078125, "kl": 0.07996979088056833, "learning_rate": 7.855999999999999e-06, "loss": 0.0032, "num_tokens": 16025623.0, "reward": 2.948993444442749, "reward_std": 0.07517999410629272, "rewards/reward_fn/mean": 2.948993444442749, "rewards/reward_fn/std": 0.07518000900745392, "step": 361 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 125.0, "completions/max_terminated_length": 125.0, "completions/mean_length": 92.25, "completions/mean_terminated_length": 92.25, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "epoch": 0.03840033945051448, "frac_reward_zero_std": 1.0, "grad_norm": 0.13671875, "kl": 0.07063624216243625, "learning_rate": 7.8556e-06, "loss": 0.0028, "num_tokens": 16066367.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 362 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 126.0, "completions/max_terminated_length": 126.0, "completions/mean_length": 106.28125, "completions/mean_terminated_length": 106.28125, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "epoch": 0.038506417736289385, "frac_reward_zero_std": 1.0, "grad_norm": 0.11376953125, "kl": 0.023690353438723832, "learning_rate": 7.855199999999999e-06, "loss": 0.0009, "num_tokens": 16105928.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 363 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 328.0, "completions/max_terminated_length": 328.0, "completions/mean_length": 235.1875, "completions/mean_terminated_length": 235.1875, "completions/min_length": 189.0, "completions/min_terminated_length": 189.0, "epoch": 0.038612496022064284, "frac_reward_zero_std": 0.0, "grad_norm": 1.4140625, "kl": 0.06796126707922667, "learning_rate": 7.8548e-06, "loss": 0.0027, "num_tokens": 16150414.0, "reward": 3.9616689682006836, "reward_std": 0.21683254837989807, "rewards/reward_fn/mean": 3.9616689682006836, "rewards/reward_fn/std": 0.21683259308338165, "step": 364 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1047.0, "completions/max_terminated_length": 1047.0, "completions/mean_length": 465.9375, "completions/mean_terminated_length": 465.9375, "completions/min_length": 326.0, "completions/min_terminated_length": 326.0, "epoch": 0.03871857430783918, "frac_reward_zero_std": 0.0, "grad_norm": 1.1875, "kl": 0.043375019915401936, "learning_rate": 7.854399999999999e-06, "loss": 0.0017, "num_tokens": 16226796.0, "reward": 3.8493480682373047, "reward_std": 0.4049968421459198, "rewards/reward_fn/mean": 3.8493480682373047, "rewards/reward_fn/std": 0.4049968123435974, "step": 365 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 187.0, "completions/max_terminated_length": 187.0, "completions/mean_length": 146.46875, "completions/mean_terminated_length": 146.46875, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "epoch": 0.03882465259361409, "frac_reward_zero_std": 1.0, "grad_norm": 0.1318359375, "kl": 0.06945032655494288, "learning_rate": 7.854e-06, "loss": 0.0028, "num_tokens": 16268891.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 366 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 436.0, "completions/max_terminated_length": 436.0, "completions/mean_length": 274.15625, "completions/mean_terminated_length": 274.15625, "completions/min_length": 184.0, "completions/min_terminated_length": 184.0, "epoch": 0.03893073087938899, "frac_reward_zero_std": 0.0, "grad_norm": 2.390625, "kl": 0.04315848933765665, "learning_rate": 7.8536e-06, "loss": 0.0017, "num_tokens": 16311296.0, "reward": 3.2710511684417725, "reward_std": 0.4644310474395752, "rewards/reward_fn/mean": 3.2710511684417725, "rewards/reward_fn/std": 0.4644309878349304, "step": 367 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 189.0, "completions/max_terminated_length": 189.0, "completions/mean_length": 167.78125, "completions/mean_terminated_length": 167.78125, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "epoch": 0.03903680916516389, "frac_reward_zero_std": 1.0, "grad_norm": 0.11328125, "kl": 0.051279135106597096, "learning_rate": 7.8532e-06, "loss": 0.0021, "num_tokens": 16337785.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 368 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 161.0, "completions/max_terminated_length": 161.0, "completions/mean_length": 118.5, "completions/mean_terminated_length": 118.5, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "epoch": 0.039142887450938794, "frac_reward_zero_std": 1.0, "grad_norm": 0.19140625, "kl": 0.09624231781344861, "learning_rate": 7.8528e-06, "loss": 0.0038, "num_tokens": 16367145.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 369 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 211.8125, "completions/mean_terminated_length": 211.8125, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "epoch": 0.039248965736713694, "frac_reward_zero_std": 0.0, "grad_norm": 1.828125, "kl": 0.08367881097365171, "learning_rate": 7.8524e-06, "loss": 0.0033, "num_tokens": 16403747.0, "reward": 3.0466554164886475, "reward_std": 0.030376369133591652, "rewards/reward_fn/mean": 3.0466554164886475, "rewards/reward_fn/std": 0.0303763709962368, "step": 370 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 214.0, "completions/max_terminated_length": 214.0, "completions/mean_length": 148.3125, "completions/mean_terminated_length": 148.3125, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "epoch": 0.0393550440224886, "frac_reward_zero_std": 1.0, "grad_norm": 0.142578125, "kl": 0.05079342168755829, "learning_rate": 7.852e-06, "loss": 0.002, "num_tokens": 16452813.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 371 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 210.0, "completions/max_terminated_length": 210.0, "completions/mean_length": 141.0, "completions/mean_terminated_length": 141.0, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "epoch": 0.0394611223082635, "frac_reward_zero_std": 1.0, "grad_norm": 0.1884765625, "kl": 0.0790958609431982, "learning_rate": 7.8516e-06, "loss": 0.0032, "num_tokens": 16480749.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 372 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 230.0, "completions/max_terminated_length": 230.0, "completions/mean_length": 172.09375, "completions/mean_terminated_length": 172.09375, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "epoch": 0.0395672005940384, "frac_reward_zero_std": 1.0, "grad_norm": 0.138671875, "kl": 0.07499957724940032, "learning_rate": 7.8512e-06, "loss": 0.003, "num_tokens": 16527984.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 373 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 623.0, "completions/max_terminated_length": 623.0, "completions/mean_length": 449.53125, "completions/mean_terminated_length": 449.53125, "completions/min_length": 204.0, "completions/min_terminated_length": 204.0, "epoch": 0.039673278879813305, "frac_reward_zero_std": 0.0, "grad_norm": 1.4140625, "kl": 0.05416272731963545, "learning_rate": 7.8508e-06, "loss": 0.0022, "num_tokens": 16589505.0, "reward": 1.9449317455291748, "reward_std": 0.4181321859359741, "rewards/reward_fn/mean": 1.9449317455291748, "rewards/reward_fn/std": 0.4181321859359741, "step": 374 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 379.0, "completions/max_terminated_length": 379.0, "completions/mean_length": 230.84375, "completions/mean_terminated_length": 230.84375, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "epoch": 0.039779357165588204, "frac_reward_zero_std": 0.0, "grad_norm": 1.7109375, "kl": 0.08303264563437551, "learning_rate": 7.850399999999999e-06, "loss": 0.0033, "num_tokens": 16641596.0, "reward": 3.153766393661499, "reward_std": 0.05732966959476471, "rewards/reward_fn/mean": 3.153766393661499, "rewards/reward_fn/std": 0.0573296882212162, "step": 375 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 658.0, "completions/mean_length": 270.53125, "completions/mean_terminated_length": 213.19354248046875, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "epoch": 0.039885435451363103, "frac_reward_zero_std": 0.0, "grad_norm": 0.78515625, "kl": 0.1146129370899871, "learning_rate": 7.85e-06, "loss": 0.0046, "num_tokens": 16681421.0, "reward": 2.9682884216308594, "reward_std": 0.6505692005157471, "rewards/reward_fn/mean": 2.9682884216308594, "rewards/reward_fn/std": 0.6505692005157471, "step": 376 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 354.0, "completions/max_terminated_length": 354.0, "completions/mean_length": 232.90625, "completions/mean_terminated_length": 232.90625, "completions/min_length": 171.0, "completions/min_terminated_length": 171.0, "epoch": 0.03999151373713801, "frac_reward_zero_std": 0.0, "grad_norm": 1.765625, "kl": 0.06879323907196522, "learning_rate": 7.849599999999999e-06, "loss": 0.0027, "num_tokens": 16720330.0, "reward": 2.8407626152038574, "reward_std": 0.019035818055272102, "rewards/reward_fn/mean": 2.8407626152038574, "rewards/reward_fn/std": 0.019035782665014267, "step": 377 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 576.0, "completions/max_terminated_length": 576.0, "completions/mean_length": 382.0, "completions/mean_terminated_length": 382.0, "completions/min_length": 273.0, "completions/min_terminated_length": 273.0, "epoch": 0.04009759202291291, "frac_reward_zero_std": 0.0, "grad_norm": 1.2109375, "kl": 0.053302777581848204, "learning_rate": 7.8492e-06, "loss": 0.0021, "num_tokens": 16764714.0, "reward": 2.8831186294555664, "reward_std": 0.20725314319133759, "rewards/reward_fn/mean": 2.8831186294555664, "rewards/reward_fn/std": 0.2072531282901764, "step": 378 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1473.0, "completions/max_terminated_length": 1473.0, "completions/mean_length": 484.65625, "completions/mean_terminated_length": 484.65625, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "epoch": 0.04020367030868781, "frac_reward_zero_std": 0.0, "grad_norm": 0.93359375, "kl": 0.04577759699895978, "learning_rate": 7.8488e-06, "loss": 0.0018, "num_tokens": 16814047.0, "reward": 3.974400520324707, "reward_std": 0.144812673330307, "rewards/reward_fn/mean": 3.974400520324707, "rewards/reward_fn/std": 0.1448127031326294, "step": 379 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1922.0, "completions/mean_length": 769.53125, "completions/mean_terminated_length": 684.300048828125, "completions/min_length": 277.0, "completions/min_terminated_length": 277.0, "epoch": 0.040309748594462715, "frac_reward_zero_std": 0.0, "grad_norm": 0.78515625, "kl": 0.058462157379835844, "learning_rate": 7.8484e-06, "loss": 0.0023, "num_tokens": 16880560.0, "reward": 2.516724109649658, "reward_std": 0.7297559976577759, "rewards/reward_fn/mean": 2.516724109649658, "rewards/reward_fn/std": 0.7297559976577759, "step": 380 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1420.0, "completions/max_terminated_length": 1420.0, "completions/mean_length": 323.53125, "completions/mean_terminated_length": 323.53125, "completions/min_length": 191.0, "completions/min_terminated_length": 191.0, "epoch": 0.040415826880237614, "frac_reward_zero_std": 0.0, "grad_norm": 1.8984375, "kl": 0.06136702420189977, "learning_rate": 7.848e-06, "loss": 0.0025, "num_tokens": 16927777.0, "reward": 3.8343114852905273, "reward_std": 0.5490050911903381, "rewards/reward_fn/mean": 3.8343114852905273, "rewards/reward_fn/std": 0.5490050911903381, "step": 381 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 289.0, "completions/max_terminated_length": 289.0, "completions/mean_length": 190.25, "completions/mean_terminated_length": 190.25, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "epoch": 0.04052190516601252, "frac_reward_zero_std": 1.0, "grad_norm": 0.1416015625, "kl": 0.04089883284177631, "learning_rate": 7.8476e-06, "loss": 0.0016, "num_tokens": 16949001.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 382 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 438.0, "completions/max_terminated_length": 438.0, "completions/mean_length": 165.78125, "completions/mean_terminated_length": 165.78125, "completions/min_length": 70.0, "completions/min_terminated_length": 70.0, "epoch": 0.04062798345178742, "frac_reward_zero_std": 1.0, "grad_norm": 0.0888671875, "kl": 0.03171997581375763, "learning_rate": 7.8472e-06, "loss": 0.0013, "num_tokens": 16979586.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 383 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1416.0, "completions/max_terminated_length": 1416.0, "completions/mean_length": 920.875, "completions/mean_terminated_length": 920.875, "completions/min_length": 573.0, "completions/min_terminated_length": 573.0, "epoch": 0.04073406173756232, "frac_reward_zero_std": 0.0, "grad_norm": 0.9609375, "kl": 0.03487598354695365, "learning_rate": 7.846799999999999e-06, "loss": 0.0014, "num_tokens": 17051486.0, "reward": 2.8845808506011963, "reward_std": 0.4707050025463104, "rewards/reward_fn/mean": 2.8845808506011963, "rewards/reward_fn/std": 0.47070497274398804, "step": 384 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 412.0, "completions/max_terminated_length": 412.0, "completions/mean_length": 307.15625, "completions/mean_terminated_length": 307.15625, "completions/min_length": 225.0, "completions/min_terminated_length": 225.0, "epoch": 0.040840140023337225, "frac_reward_zero_std": 0.0, "grad_norm": 1.3671875, "kl": 0.04412680730456486, "learning_rate": 7.8464e-06, "loss": 0.0018, "num_tokens": 17096515.0, "reward": 2.8527889251708984, "reward_std": 0.02638748660683632, "rewards/reward_fn/mean": 2.8527889251708984, "rewards/reward_fn/std": 0.026387471705675125, "step": 385 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 679.0, "completions/max_terminated_length": 679.0, "completions/mean_length": 451.25, "completions/mean_terminated_length": 451.25, "completions/min_length": 241.0, "completions/min_terminated_length": 241.0, "epoch": 0.040946218309112124, "frac_reward_zero_std": 0.0, "grad_norm": 1.34375, "kl": 0.04106189822778106, "learning_rate": 7.845999999999999e-06, "loss": 0.0016, "num_tokens": 17144203.0, "reward": 2.9010472297668457, "reward_std": 0.03634057193994522, "rewards/reward_fn/mean": 2.9010472297668457, "rewards/reward_fn/std": 0.03634057566523552, "step": 386 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 843.0, "completions/max_terminated_length": 843.0, "completions/mean_length": 442.40625, "completions/mean_terminated_length": 442.40625, "completions/min_length": 215.0, "completions/min_terminated_length": 215.0, "epoch": 0.041052296594887024, "frac_reward_zero_std": 0.0, "grad_norm": 1.203125, "kl": 0.05888936563860625, "learning_rate": 7.8456e-06, "loss": 0.0024, "num_tokens": 17190904.0, "reward": 2.7666423320770264, "reward_std": 0.17257817089557648, "rewards/reward_fn/mean": 2.7666423320770264, "rewards/reward_fn/std": 0.17257815599441528, "step": 387 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 360.0, "completions/max_terminated_length": 360.0, "completions/mean_length": 169.5625, "completions/mean_terminated_length": 169.5625, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "epoch": 0.04115837488066193, "frac_reward_zero_std": 1.0, "grad_norm": 0.1044921875, "kl": 0.037654812389519066, "learning_rate": 7.845199999999999e-06, "loss": 0.0015, "num_tokens": 17216970.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 388 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 166.0, "completions/max_terminated_length": 166.0, "completions/mean_length": 105.46875, "completions/mean_terminated_length": 105.46875, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "epoch": 0.04126445316643683, "frac_reward_zero_std": 1.0, "grad_norm": 0.12890625, "kl": 0.046366847469471395, "learning_rate": 7.8448e-06, "loss": 0.0019, "num_tokens": 17254361.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 389 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 222.0, "completions/max_terminated_length": 222.0, "completions/mean_length": 207.375, "completions/mean_terminated_length": 207.375, "completions/min_length": 194.0, "completions/min_terminated_length": 194.0, "epoch": 0.041370531452211735, "frac_reward_zero_std": 1.0, "grad_norm": 0.0810546875, "kl": 0.047395337373018265, "learning_rate": 7.8444e-06, "loss": 0.0019, "num_tokens": 17289285.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 390 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 338.0, "completions/max_terminated_length": 338.0, "completions/mean_length": 222.6875, "completions/mean_terminated_length": 222.6875, "completions/min_length": 190.0, "completions/min_terminated_length": 190.0, "epoch": 0.041476609737986635, "frac_reward_zero_std": 0.0, "grad_norm": 1.609375, "kl": 0.04588223801692948, "learning_rate": 7.844e-06, "loss": 0.0018, "num_tokens": 17333147.0, "reward": 3.4899792671203613, "reward_std": 0.5518075227737427, "rewards/reward_fn/mean": 3.4899792671203613, "rewards/reward_fn/std": 0.5518075227737427, "step": 391 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 292.0, "completions/max_terminated_length": 292.0, "completions/mean_length": 220.625, "completions/mean_terminated_length": 220.625, "completions/min_length": 182.0, "completions/min_terminated_length": 182.0, "epoch": 0.041582688023761534, "frac_reward_zero_std": 1.0, "grad_norm": 0.146484375, "kl": 0.05707505450118333, "learning_rate": 7.8436e-06, "loss": 0.0023, "num_tokens": 17370575.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 392 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 142.0, "completions/max_terminated_length": 142.0, "completions/mean_length": 111.40625, "completions/mean_terminated_length": 111.40625, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "epoch": 0.04168876630953644, "frac_reward_zero_std": 1.0, "grad_norm": 0.08740234375, "kl": 0.036936537886504084, "learning_rate": 7.8432e-06, "loss": 0.0015, "num_tokens": 17418748.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 393 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 762.0, "completions/max_terminated_length": 762.0, "completions/mean_length": 409.90625, "completions/mean_terminated_length": 409.90625, "completions/min_length": 214.0, "completions/min_terminated_length": 214.0, "epoch": 0.04179484459531134, "frac_reward_zero_std": 0.0, "grad_norm": 0.9921875, "kl": 0.047665696358308196, "learning_rate": 7.8428e-06, "loss": 0.0019, "num_tokens": 17465273.0, "reward": 2.5570666790008545, "reward_std": 0.24715420603752136, "rewards/reward_fn/mean": 2.5570666790008545, "rewards/reward_fn/std": 0.247154101729393, "step": 394 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 605.0, "completions/max_terminated_length": 605.0, "completions/mean_length": 248.03125, "completions/mean_terminated_length": 248.03125, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "epoch": 0.04190092288108624, "frac_reward_zero_std": 0.0, "grad_norm": 2.234375, "kl": 0.07105874724220484, "learning_rate": 7.8424e-06, "loss": 0.0028, "num_tokens": 17503386.0, "reward": 2.808501720428467, "reward_std": 0.023406412452459335, "rewards/reward_fn/mean": 2.808501720428467, "rewards/reward_fn/std": 0.023406431078910828, "step": 395 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 399.0, "completions/max_terminated_length": 399.0, "completions/mean_length": 247.6875, "completions/mean_terminated_length": 247.6875, "completions/min_length": 216.0, "completions/min_terminated_length": 216.0, "epoch": 0.042007001166861145, "frac_reward_zero_std": 0.0, "grad_norm": 1.328125, "kl": 0.04195737303234637, "learning_rate": 7.841999999999999e-06, "loss": 0.0017, "num_tokens": 17542512.0, "reward": 2.7778372764587402, "reward_std": 0.22531868517398834, "rewards/reward_fn/mean": 2.7778372764587402, "rewards/reward_fn/std": 0.22531865537166595, "step": 396 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 87.0, "completions/max_terminated_length": 87.0, "completions/mean_length": 78.34375, "completions/mean_terminated_length": 78.34375, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "epoch": 0.042113079452636044, "frac_reward_zero_std": 0.0, "grad_norm": 3.171875, "kl": 0.044936935941223055, "learning_rate": 7.8416e-06, "loss": 0.0018, "num_tokens": 17579419.0, "reward": 3.0178000926971436, "reward_std": 0.008912133984267712, "rewards/reward_fn/mean": 3.0178000926971436, "rewards/reward_fn/std": 0.00891213957220316, "step": 397 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 524.0, "completions/max_terminated_length": 524.0, "completions/mean_length": 376.65625, "completions/mean_terminated_length": 376.65625, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "epoch": 0.04221915773841095, "frac_reward_zero_std": 0.0, "grad_norm": 1.3203125, "kl": 0.046278940804768354, "learning_rate": 7.841199999999999e-06, "loss": 0.0019, "num_tokens": 17624592.0, "reward": 2.8509998321533203, "reward_std": 0.06351502239704132, "rewards/reward_fn/mean": 2.8509998321533203, "rewards/reward_fn/std": 0.06351498514413834, "step": 398 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 365.0, "completions/max_terminated_length": 365.0, "completions/mean_length": 275.40625, "completions/mean_terminated_length": 275.40625, "completions/min_length": 210.0, "completions/min_terminated_length": 210.0, "epoch": 0.04232523602418585, "frac_reward_zero_std": 0.0, "grad_norm": 1.609375, "kl": 0.07029017817694694, "learning_rate": 7.8408e-06, "loss": 0.0028, "num_tokens": 17684509.0, "reward": 3.8989593982696533, "reward_std": 0.3194463849067688, "rewards/reward_fn/mean": 3.8989593982696533, "rewards/reward_fn/std": 0.3194463551044464, "step": 399 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 779.0, "completions/max_terminated_length": 779.0, "completions/mean_length": 455.0, "completions/mean_terminated_length": 455.0, "completions/min_length": 326.0, "completions/min_terminated_length": 326.0, "epoch": 0.04243131430996075, "frac_reward_zero_std": 0.0, "grad_norm": 1.21875, "kl": 0.054996508290059865, "learning_rate": 7.840399999999999e-06, "loss": 0.0022, "num_tokens": 17739645.0, "reward": 3.0377087593078613, "reward_std": 0.31760552525520325, "rewards/reward_fn/mean": 3.0377087593078613, "rewards/reward_fn/std": 0.31760555505752563, "step": 400 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 199.0, "completions/max_terminated_length": 199.0, "completions/mean_length": 189.3125, "completions/mean_terminated_length": 189.3125, "completions/min_length": 178.0, "completions/min_terminated_length": 178.0, "epoch": 0.042537392595735656, "frac_reward_zero_std": 1.0, "grad_norm": 0.1552734375, "kl": 0.06360255589243025, "learning_rate": 7.84e-06, "loss": 0.0025, "num_tokens": 17798055.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 401 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 295.0, "completions/max_terminated_length": 295.0, "completions/mean_length": 244.84375, "completions/mean_terminated_length": 244.84375, "completions/min_length": 198.0, "completions/min_terminated_length": 198.0, "epoch": 0.042643470881510555, "frac_reward_zero_std": 0.0, "grad_norm": 2.296875, "kl": 0.057871548808179796, "learning_rate": 7.8396e-06, "loss": 0.0023, "num_tokens": 17840002.0, "reward": 3.0404982566833496, "reward_std": 0.36853575706481934, "rewards/reward_fn/mean": 3.0404982566833496, "rewards/reward_fn/std": 0.36853572726249695, "step": 402 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 298.0, "completions/max_terminated_length": 298.0, "completions/mean_length": 244.84375, "completions/mean_terminated_length": 244.84375, "completions/min_length": 194.0, "completions/min_terminated_length": 194.0, "epoch": 0.042749549167285454, "frac_reward_zero_std": 0.0, "grad_norm": 3.3125, "kl": 0.049547198228538036, "learning_rate": 7.8392e-06, "loss": 0.002, "num_tokens": 17884829.0, "reward": 2.909618377685547, "reward_std": 0.01886744424700737, "rewards/reward_fn/mean": 2.909618377685547, "rewards/reward_fn/std": 0.018867461010813713, "step": 403 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1095.0, "completions/max_terminated_length": 1095.0, "completions/mean_length": 577.65625, "completions/mean_terminated_length": 577.65625, "completions/min_length": 318.0, "completions/min_terminated_length": 318.0, "epoch": 0.04285562745306036, "frac_reward_zero_std": 0.0, "grad_norm": 1.234375, "kl": 0.04880936082918197, "learning_rate": 7.8388e-06, "loss": 0.002, "num_tokens": 17938258.0, "reward": 2.958042621612549, "reward_std": 0.6647323966026306, "rewards/reward_fn/mean": 2.958042621612549, "rewards/reward_fn/std": 0.6647323369979858, "step": 404 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 170.0, "completions/max_terminated_length": 170.0, "completions/mean_length": 148.6875, "completions/mean_terminated_length": 148.6875, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "epoch": 0.04296170573883526, "frac_reward_zero_std": 1.0, "grad_norm": 0.1552734375, "kl": 0.08973181049805135, "learning_rate": 7.8384e-06, "loss": 0.0036, "num_tokens": 17989672.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 405 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 365.0, "completions/max_terminated_length": 365.0, "completions/mean_length": 242.0, "completions/mean_terminated_length": 242.0, "completions/min_length": 218.0, "completions/min_terminated_length": 218.0, "epoch": 0.04306778402461016, "frac_reward_zero_std": 1.0, "grad_norm": 0.119140625, "kl": 0.07667279534507543, "learning_rate": 7.838e-06, "loss": 0.0031, "num_tokens": 18012776.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 406 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 291.0, "completions/max_terminated_length": 291.0, "completions/mean_length": 248.375, "completions/mean_terminated_length": 248.375, "completions/min_length": 231.0, "completions/min_terminated_length": 231.0, "epoch": 0.043173862310385065, "frac_reward_zero_std": 1.0, "grad_norm": 0.0751953125, "kl": 0.0704468511394225, "learning_rate": 7.8376e-06, "loss": 0.0028, "num_tokens": 18057108.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 407 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 348.0, "completions/max_terminated_length": 348.0, "completions/mean_length": 240.4375, "completions/mean_terminated_length": 240.4375, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "epoch": 0.043279940596159965, "frac_reward_zero_std": 0.0, "grad_norm": 2.234375, "kl": 0.035191603121347725, "learning_rate": 7.8372e-06, "loss": 0.0014, "num_tokens": 18145698.0, "reward": 3.6485748291015625, "reward_std": 0.8297750353813171, "rewards/reward_fn/mean": 3.6485748291015625, "rewards/reward_fn/std": 0.8297750353813171, "step": 408 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 158.0, "completions/max_terminated_length": 158.0, "completions/mean_length": 147.96875, "completions/mean_terminated_length": 147.96875, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "epoch": 0.04338601888193487, "frac_reward_zero_std": 1.0, "grad_norm": 0.0888671875, "kl": 0.07083654997404665, "learning_rate": 7.8368e-06, "loss": 0.0028, "num_tokens": 18169281.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 409 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 246.0, "completions/max_terminated_length": 246.0, "completions/mean_length": 196.375, "completions/mean_terminated_length": 196.375, "completions/min_length": 185.0, "completions/min_terminated_length": 185.0, "epoch": 0.04349209716770977, "frac_reward_zero_std": 1.0, "grad_norm": 0.1572265625, "kl": 0.0727719494025223, "learning_rate": 7.8364e-06, "loss": 0.0029, "num_tokens": 18211149.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 410 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 249.0, "completions/max_terminated_length": 249.0, "completions/mean_length": 221.46875, "completions/mean_terminated_length": 221.46875, "completions/min_length": 196.0, "completions/min_terminated_length": 196.0, "epoch": 0.04359817545348467, "frac_reward_zero_std": 1.0, "grad_norm": 0.11669921875, "kl": 0.0683895600377582, "learning_rate": 7.836e-06, "loss": 0.0027, "num_tokens": 18253244.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 411 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 77.0, "completions/max_terminated_length": 77.0, "completions/mean_length": 71.5625, "completions/mean_terminated_length": 71.5625, "completions/min_length": 68.0, "completions/min_terminated_length": 68.0, "epoch": 0.043704253739259576, "frac_reward_zero_std": 1.0, "grad_norm": 0.2236328125, "kl": 0.034835965518141165, "learning_rate": 7.8356e-06, "loss": 0.0014, "num_tokens": 18287022.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 412 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 688.0, "completions/max_terminated_length": 688.0, "completions/mean_length": 397.03125, "completions/mean_terminated_length": 397.03125, "completions/min_length": 225.0, "completions/min_terminated_length": 225.0, "epoch": 0.043810332025034475, "frac_reward_zero_std": 0.0, "grad_norm": 1.6796875, "kl": 0.07421079277992249, "learning_rate": 7.8352e-06, "loss": 0.003, "num_tokens": 18335087.0, "reward": 2.8003878593444824, "reward_std": 0.02719496749341488, "rewards/reward_fn/mean": 2.8003878593444824, "rewards/reward_fn/std": 0.027194969356060028, "step": 413 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 150.0, "completions/max_terminated_length": 150.0, "completions/mean_length": 126.5625, "completions/mean_terminated_length": 126.5625, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "epoch": 0.043916410310809374, "frac_reward_zero_std": 1.0, "grad_norm": 0.1416015625, "kl": 0.08099708310328424, "learning_rate": 7.8348e-06, "loss": 0.0032, "num_tokens": 18373889.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 414 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 174.0, "completions/max_terminated_length": 174.0, "completions/mean_length": 160.53125, "completions/mean_terminated_length": 160.53125, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "epoch": 0.04402248859658428, "frac_reward_zero_std": 1.0, "grad_norm": 0.10986328125, "kl": 0.07151383883319795, "learning_rate": 7.834399999999999e-06, "loss": 0.0029, "num_tokens": 18411602.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 415 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 277.0, "completions/max_terminated_length": 277.0, "completions/mean_length": 215.1875, "completions/mean_terminated_length": 215.1875, "completions/min_length": 183.0, "completions/min_terminated_length": 183.0, "epoch": 0.04412856688235918, "frac_reward_zero_std": 0.0, "grad_norm": 1.9375, "kl": 0.03778650192543864, "learning_rate": 7.834e-06, "loss": 0.0015, "num_tokens": 18458776.0, "reward": 3.094648838043213, "reward_std": 0.09678830951452255, "rewards/reward_fn/mean": 3.094648838043213, "rewards/reward_fn/std": 0.09678832441568375, "step": 416 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 406.0, "completions/max_terminated_length": 406.0, "completions/mean_length": 344.4375, "completions/mean_terminated_length": 344.4375, "completions/min_length": 243.0, "completions/min_terminated_length": 243.0, "epoch": 0.044234645168134086, "frac_reward_zero_std": 0.0, "grad_norm": 1.265625, "kl": 0.0643120965687558, "learning_rate": 7.833599999999999e-06, "loss": 0.0026, "num_tokens": 18508710.0, "reward": 1.7739882469177246, "reward_std": 0.015309223905205727, "rewards/reward_fn/mean": 1.7739882469177246, "rewards/reward_fn/std": 0.015309221111238003, "step": 417 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 475.0, "completions/max_terminated_length": 475.0, "completions/mean_length": 261.65625, "completions/mean_terminated_length": 261.65625, "completions/min_length": 181.0, "completions/min_terminated_length": 181.0, "epoch": 0.044340723453908985, "frac_reward_zero_std": 1.0, "grad_norm": 0.1181640625, "kl": 0.07708191289566457, "learning_rate": 7.8332e-06, "loss": 0.0031, "num_tokens": 18552283.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 418 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 255.5625, "completions/mean_terminated_length": 255.5625, "completions/min_length": 240.0, "completions/min_terminated_length": 240.0, "epoch": 0.044446801739683885, "frac_reward_zero_std": 1.0, "grad_norm": 0.08740234375, "kl": 0.030837459082249552, "learning_rate": 7.832799999999999e-06, "loss": 0.0012, "num_tokens": 18593517.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 419 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 159.0, "completions/max_terminated_length": 159.0, "completions/mean_length": 139.03125, "completions/mean_terminated_length": 139.03125, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "epoch": 0.04455288002545879, "frac_reward_zero_std": 1.0, "grad_norm": 0.1630859375, "kl": 0.07660691998898983, "learning_rate": 7.8324e-06, "loss": 0.0031, "num_tokens": 18633038.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 420 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 178.0, "completions/max_terminated_length": 178.0, "completions/mean_length": 147.34375, "completions/mean_terminated_length": 147.34375, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "epoch": 0.04465895831123369, "frac_reward_zero_std": 1.0, "grad_norm": 0.1650390625, "kl": 0.062265314627438784, "learning_rate": 7.831999999999999e-06, "loss": 0.0025, "num_tokens": 18662009.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 421 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 185.0, "completions/max_terminated_length": 185.0, "completions/mean_length": 167.34375, "completions/mean_terminated_length": 167.34375, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "epoch": 0.04476503659700859, "frac_reward_zero_std": 1.0, "grad_norm": 0.07275390625, "kl": 0.038907009293325245, "learning_rate": 7.8316e-06, "loss": 0.0016, "num_tokens": 18715684.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 422 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 196.0, "completions/max_terminated_length": 196.0, "completions/mean_length": 165.1875, "completions/mean_terminated_length": 165.1875, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "epoch": 0.044871114882783496, "frac_reward_zero_std": 1.0, "grad_norm": 0.1435546875, "kl": 0.045978159294463694, "learning_rate": 7.831199999999999e-06, "loss": 0.0018, "num_tokens": 18759914.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 423 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 279.0, "completions/max_terminated_length": 279.0, "completions/mean_length": 223.0625, "completions/mean_terminated_length": 223.0625, "completions/min_length": 182.0, "completions/min_terminated_length": 182.0, "epoch": 0.044977193168558395, "frac_reward_zero_std": 1.0, "grad_norm": 0.123046875, "kl": 0.06679196062032133, "learning_rate": 7.8308e-06, "loss": 0.0027, "num_tokens": 18810924.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 424 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 569.0, "completions/max_terminated_length": 569.0, "completions/mean_length": 413.96875, "completions/mean_terminated_length": 413.96875, "completions/min_length": 280.0, "completions/min_terminated_length": 280.0, "epoch": 0.0450832714543333, "frac_reward_zero_std": 0.0, "grad_norm": 1.171875, "kl": 0.050274993176572025, "learning_rate": 7.8304e-06, "loss": 0.002, "num_tokens": 18858699.0, "reward": 3.965904474258423, "reward_std": 0.1928737610578537, "rewards/reward_fn/mean": 3.965904474258423, "rewards/reward_fn/std": 0.1928737461566925, "step": 425 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 119.0, "completions/max_terminated_length": 119.0, "completions/mean_length": 98.21875, "completions/mean_terminated_length": 98.21875, "completions/min_length": 72.0, "completions/min_terminated_length": 72.0, "epoch": 0.0451893497401082, "frac_reward_zero_std": 1.0, "grad_norm": 0.1708984375, "kl": 0.03456445218762383, "learning_rate": 7.83e-06, "loss": 0.0014, "num_tokens": 18906642.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 426 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 485.0, "completions/max_terminated_length": 485.0, "completions/mean_length": 267.96875, "completions/mean_terminated_length": 267.96875, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "epoch": 0.0452954280258831, "frac_reward_zero_std": 0.0, "grad_norm": 2.140625, "kl": 0.07573176105506718, "learning_rate": 7.8296e-06, "loss": 0.003, "num_tokens": 18946129.0, "reward": 2.3788418769836426, "reward_std": 0.5687231421470642, "rewards/reward_fn/mean": 2.3788418769836426, "rewards/reward_fn/std": 0.5687231421470642, "step": 427 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 226.0, "completions/max_terminated_length": 226.0, "completions/mean_length": 160.5, "completions/mean_terminated_length": 160.5, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "epoch": 0.045401506311658006, "frac_reward_zero_std": 0.0, "grad_norm": 2.40625, "kl": 0.06303711561486125, "learning_rate": 7.8292e-06, "loss": 0.0025, "num_tokens": 18986625.0, "reward": 3.611386775970459, "reward_std": 0.5146521925926208, "rewards/reward_fn/mean": 3.611386775970459, "rewards/reward_fn/std": 0.5146521925926208, "step": 428 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 276.0, "completions/max_terminated_length": 276.0, "completions/mean_length": 254.53125, "completions/mean_terminated_length": 254.53125, "completions/min_length": 242.0, "completions/min_terminated_length": 242.0, "epoch": 0.045507584597432905, "frac_reward_zero_std": 1.0, "grad_norm": 0.08447265625, "kl": 0.040832315338775516, "learning_rate": 7.8288e-06, "loss": 0.0016, "num_tokens": 19041426.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 429 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1443.0, "completions/max_terminated_length": 1443.0, "completions/mean_length": 800.96875, "completions/mean_terminated_length": 800.96875, "completions/min_length": 354.0, "completions/min_terminated_length": 354.0, "epoch": 0.045613662883207805, "frac_reward_zero_std": 0.0, "grad_norm": 1.2734375, "kl": 0.055984109407290816, "learning_rate": 7.8284e-06, "loss": 0.0022, "num_tokens": 19111281.0, "reward": 2.264820098876953, "reward_std": 0.5752713084220886, "rewards/reward_fn/mean": 2.264820098876953, "rewards/reward_fn/std": 0.5752713084220886, "step": 430 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 313.0, "completions/max_terminated_length": 313.0, "completions/mean_length": 250.375, "completions/mean_terminated_length": 250.375, "completions/min_length": 208.0, "completions/min_terminated_length": 208.0, "epoch": 0.04571974116898271, "frac_reward_zero_std": 1.0, "grad_norm": 0.11083984375, "kl": 0.0701156510040164, "learning_rate": 7.828e-06, "loss": 0.0028, "num_tokens": 19154909.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 431 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 240.0, "completions/max_terminated_length": 240.0, "completions/mean_length": 200.21875, "completions/mean_terminated_length": 200.21875, "completions/min_length": 181.0, "completions/min_terminated_length": 181.0, "epoch": 0.04582581945475761, "frac_reward_zero_std": 0.0, "grad_norm": 1.65625, "kl": 0.04736822139238939, "learning_rate": 7.8276e-06, "loss": 0.0019, "num_tokens": 19181796.0, "reward": 3.932534694671631, "reward_std": 0.26547369360923767, "rewards/reward_fn/mean": 3.932534694671631, "rewards/reward_fn/std": 0.26547372341156006, "step": 432 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 361.0, "completions/max_terminated_length": 361.0, "completions/mean_length": 300.53125, "completions/mean_terminated_length": 300.53125, "completions/min_length": 247.0, "completions/min_terminated_length": 247.0, "epoch": 0.04593189774053251, "frac_reward_zero_std": 0.0, "grad_norm": 1.2109375, "kl": 0.029562294948846102, "learning_rate": 7.8272e-06, "loss": 0.0012, "num_tokens": 19240213.0, "reward": 2.923275947570801, "reward_std": 0.02933095581829548, "rewards/reward_fn/mean": 2.923275947570801, "rewards/reward_fn/std": 0.02933092787861824, "step": 433 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 142.0, "completions/max_terminated_length": 142.0, "completions/mean_length": 102.0, "completions/mean_terminated_length": 102.0, "completions/min_length": 66.0, "completions/min_terminated_length": 66.0, "epoch": 0.046037976026307416, "frac_reward_zero_std": 1.0, "grad_norm": 0.1728515625, "kl": 0.0631599115440622, "learning_rate": 7.8268e-06, "loss": 0.0025, "num_tokens": 19266869.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 434 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 313.0, "completions/max_terminated_length": 313.0, "completions/mean_length": 203.8125, "completions/mean_terminated_length": 203.8125, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "epoch": 0.046144054312082315, "frac_reward_zero_std": 0.0, "grad_norm": 1.9453125, "kl": 0.09093636996112764, "learning_rate": 7.826399999999998e-06, "loss": 0.0036, "num_tokens": 19304239.0, "reward": 3.0638785362243652, "reward_std": 0.0846947655081749, "rewards/reward_fn/mean": 3.0638785362243652, "rewards/reward_fn/std": 0.0846947655081749, "step": 435 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 356.0, "completions/max_terminated_length": 356.0, "completions/mean_length": 291.625, "completions/mean_terminated_length": 291.625, "completions/min_length": 223.0, "completions/min_terminated_length": 223.0, "epoch": 0.04625013259785722, "frac_reward_zero_std": 0.0, "grad_norm": 1.421875, "kl": 0.0460854871198535, "learning_rate": 7.826e-06, "loss": 0.0018, "num_tokens": 19357891.0, "reward": 3.0054774284362793, "reward_std": 0.10288947075605392, "rewards/reward_fn/mean": 3.0054774284362793, "rewards/reward_fn/std": 0.10288945585489273, "step": 436 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 432.0, "completions/max_terminated_length": 432.0, "completions/mean_length": 255.4375, "completions/mean_terminated_length": 255.4375, "completions/min_length": 174.0, "completions/min_terminated_length": 174.0, "epoch": 0.04635621088363212, "frac_reward_zero_std": 0.0, "grad_norm": 1.5859375, "kl": 0.05067626805976033, "learning_rate": 7.8256e-06, "loss": 0.002, "num_tokens": 19417105.0, "reward": 3.9661037921905518, "reward_std": 0.1917456090450287, "rewards/reward_fn/mean": 3.9661037921905518, "rewards/reward_fn/std": 0.1917456090450287, "step": 437 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 435.0, "completions/max_terminated_length": 435.0, "completions/mean_length": 306.0625, "completions/mean_terminated_length": 306.0625, "completions/min_length": 202.0, "completions/min_terminated_length": 202.0, "epoch": 0.04646228916940702, "frac_reward_zero_std": 0.0, "grad_norm": 1.5234375, "kl": 0.060956618282943964, "learning_rate": 7.8252e-06, "loss": 0.0024, "num_tokens": 19462579.0, "reward": 3.7817397117614746, "reward_std": 0.3843442499637604, "rewards/reward_fn/mean": 3.7817397117614746, "rewards/reward_fn/std": 0.38434427976608276, "step": 438 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 254.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 212.375, "completions/mean_terminated_length": 212.375, "completions/min_length": 179.0, "completions/min_terminated_length": 179.0, "epoch": 0.046568367455181926, "frac_reward_zero_std": 1.0, "grad_norm": 0.10498046875, "kl": 0.051055049581918865, "learning_rate": 7.8248e-06, "loss": 0.002, "num_tokens": 19502815.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 439 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 202.0, "completions/max_terminated_length": 202.0, "completions/mean_length": 156.375, "completions/mean_terminated_length": 156.375, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "epoch": 0.046674445740956826, "frac_reward_zero_std": 0.0, "grad_norm": 1.953125, "kl": 0.0625815651146695, "learning_rate": 7.824399999999999e-06, "loss": 0.0025, "num_tokens": 19548203.0, "reward": 3.163956642150879, "reward_std": 0.020623939111828804, "rewards/reward_fn/mean": 3.163956642150879, "rewards/reward_fn/std": 0.020623959600925446, "step": 440 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 173.0, "completions/max_terminated_length": 173.0, "completions/mean_length": 150.8125, "completions/mean_terminated_length": 150.8125, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "epoch": 0.046780524026731725, "frac_reward_zero_std": 1.0, "grad_norm": 0.138671875, "kl": 0.06158293504267931, "learning_rate": 7.824e-06, "loss": 0.0025, "num_tokens": 19589381.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 441 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 167.0, "completions/max_terminated_length": 167.0, "completions/mean_length": 133.1875, "completions/mean_terminated_length": 133.1875, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "epoch": 0.04688660231250663, "frac_reward_zero_std": 1.0, "grad_norm": 0.1240234375, "kl": 0.05522835114970803, "learning_rate": 7.823599999999999e-06, "loss": 0.0022, "num_tokens": 19629899.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 442 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 367.0, "completions/max_terminated_length": 367.0, "completions/mean_length": 265.40625, "completions/mean_terminated_length": 265.40625, "completions/min_length": 204.0, "completions/min_terminated_length": 204.0, "epoch": 0.04699268059828153, "frac_reward_zero_std": 0.0, "grad_norm": 1.7890625, "kl": 0.054093458864372224, "learning_rate": 7.8232e-06, "loss": 0.0022, "num_tokens": 19673176.0, "reward": 3.9060635566711426, "reward_std": 0.2969805896282196, "rewards/reward_fn/mean": 3.9060635566711426, "rewards/reward_fn/std": 0.29698053002357483, "step": 443 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 310.0, "completions/max_terminated_length": 310.0, "completions/mean_length": 266.53125, "completions/mean_terminated_length": 266.53125, "completions/min_length": 247.0, "completions/min_terminated_length": 247.0, "epoch": 0.04709875888405644, "frac_reward_zero_std": 1.0, "grad_norm": 0.11572265625, "kl": 0.03358338208636269, "learning_rate": 7.822799999999999e-06, "loss": 0.0013, "num_tokens": 19714153.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 444 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 206.0, "completions/max_terminated_length": 206.0, "completions/mean_length": 170.90625, "completions/mean_terminated_length": 170.90625, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "epoch": 0.047204837169831336, "frac_reward_zero_std": 1.0, "grad_norm": 0.13671875, "kl": 0.06025209149811417, "learning_rate": 7.8224e-06, "loss": 0.0024, "num_tokens": 19752198.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 445 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 238.0, "completions/max_terminated_length": 238.0, "completions/mean_length": 170.59375, "completions/mean_terminated_length": 170.59375, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "epoch": 0.047310915455606235, "frac_reward_zero_std": 0.0, "grad_norm": 3.0, "kl": 0.03811126423534006, "learning_rate": 7.821999999999999e-06, "loss": 0.0015, "num_tokens": 19774713.0, "reward": 3.1091926097869873, "reward_std": 0.03035632148385048, "rewards/reward_fn/mean": 3.1091926097869873, "rewards/reward_fn/std": 0.030356300994753838, "step": 446 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 661.0, "completions/max_terminated_length": 661.0, "completions/mean_length": 382.125, "completions/mean_terminated_length": 382.125, "completions/min_length": 222.0, "completions/min_terminated_length": 222.0, "epoch": 0.04741699374138114, "frac_reward_zero_std": 0.0, "grad_norm": 1.3671875, "kl": 0.038349084730725735, "learning_rate": 7.8216e-06, "loss": 0.0015, "num_tokens": 19808125.0, "reward": 2.8521904945373535, "reward_std": 0.07935212552547455, "rewards/reward_fn/mean": 2.8521904945373535, "rewards/reward_fn/std": 0.07935213297605515, "step": 447 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 208.0, "completions/max_terminated_length": 208.0, "completions/mean_length": 138.40625, "completions/mean_terminated_length": 138.40625, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "epoch": 0.04752307202715604, "frac_reward_zero_std": 1.0, "grad_norm": 0.1279296875, "kl": 0.053723981720395386, "learning_rate": 7.8212e-06, "loss": 0.0021, "num_tokens": 19851658.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 448 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 211.0, "completions/max_terminated_length": 211.0, "completions/mean_length": 186.75, "completions/mean_terminated_length": 186.75, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "epoch": 0.04762915031293094, "frac_reward_zero_std": 1.0, "grad_norm": 0.1357421875, "kl": 0.05036897730315104, "learning_rate": 7.8208e-06, "loss": 0.002, "num_tokens": 19909602.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 449 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 115.0, "completions/max_terminated_length": 115.0, "completions/mean_length": 83.46875, "completions/mean_terminated_length": 83.46875, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "epoch": 0.047735228598705846, "frac_reward_zero_std": 1.0, "grad_norm": 0.08984375, "kl": 0.019947283988585696, "learning_rate": 7.8204e-06, "loss": 0.0008, "num_tokens": 19944849.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 450 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 134.0, "completions/max_terminated_length": 134.0, "completions/mean_length": 101.125, "completions/mean_terminated_length": 101.125, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "epoch": 0.047841306884480746, "frac_reward_zero_std": 1.0, "grad_norm": 0.12451171875, "kl": 0.03120685095200315, "learning_rate": 7.82e-06, "loss": 0.0012, "num_tokens": 19989013.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 451 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1136.0, "completions/max_terminated_length": 1136.0, "completions/mean_length": 483.8125, "completions/mean_terminated_length": 483.8125, "completions/min_length": 239.0, "completions/min_terminated_length": 239.0, "epoch": 0.04794738517025565, "frac_reward_zero_std": 0.0, "grad_norm": 1.8671875, "kl": 0.06700396462110803, "learning_rate": 7.8196e-06, "loss": 0.0027, "num_tokens": 20041839.0, "reward": 2.9224772453308105, "reward_std": 0.4150664508342743, "rewards/reward_fn/mean": 2.9224772453308105, "rewards/reward_fn/std": 0.4150664508342743, "step": 452 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 179.53125, "completions/mean_terminated_length": 179.53125, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "epoch": 0.04805346345603055, "frac_reward_zero_std": 1.0, "grad_norm": 0.150390625, "kl": 0.060416163643822074, "learning_rate": 7.8192e-06, "loss": 0.0024, "num_tokens": 20064992.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 453 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 197.0, "completions/max_terminated_length": 197.0, "completions/mean_length": 152.5, "completions/mean_terminated_length": 152.5, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "epoch": 0.04815954174180545, "frac_reward_zero_std": 1.0, "grad_norm": 0.10888671875, "kl": 0.06531961727887392, "learning_rate": 7.8188e-06, "loss": 0.0026, "num_tokens": 20108816.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 454 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 193.0, "completions/max_terminated_length": 193.0, "completions/mean_length": 160.03125, "completions/mean_terminated_length": 160.03125, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "epoch": 0.04826562002758036, "frac_reward_zero_std": 0.0, "grad_norm": 2.28125, "kl": 0.058511899900622666, "learning_rate": 7.8184e-06, "loss": 0.0023, "num_tokens": 20144977.0, "reward": 2.8220136165618896, "reward_std": 0.04616091400384903, "rewards/reward_fn/mean": 2.8220136165618896, "rewards/reward_fn/std": 0.046160902827978134, "step": 455 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 185.0, "completions/max_terminated_length": 185.0, "completions/mean_length": 126.15625, "completions/mean_terminated_length": 126.15625, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "epoch": 0.048371698313355256, "frac_reward_zero_std": 1.0, "grad_norm": 0.345703125, "kl": 0.02876890735933557, "learning_rate": 7.817999999999999e-06, "loss": 0.0012, "num_tokens": 20197078.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 456 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 150.0, "completions/max_terminated_length": 150.0, "completions/mean_length": 100.84375, "completions/mean_terminated_length": 100.84375, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "epoch": 0.048477776599130155, "frac_reward_zero_std": 0.0, "grad_norm": 2.3125, "kl": 0.039327553124167025, "learning_rate": 7.8176e-06, "loss": 0.0016, "num_tokens": 20234897.0, "reward": 3.0898919105529785, "reward_std": 0.03461963310837746, "rewards/reward_fn/mean": 3.0898919105529785, "rewards/reward_fn/std": 0.03461962565779686, "step": 457 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 685.0, "completions/max_terminated_length": 685.0, "completions/mean_length": 442.59375, "completions/mean_terminated_length": 442.59375, "completions/min_length": 233.0, "completions/min_terminated_length": 233.0, "epoch": 0.04858385488490506, "frac_reward_zero_std": 0.0, "grad_norm": 1.3359375, "kl": 0.05146264052018523, "learning_rate": 7.817199999999999e-06, "loss": 0.0021, "num_tokens": 20283204.0, "reward": 3.024125099182129, "reward_std": 0.13125301897525787, "rewards/reward_fn/mean": 3.024125099182129, "rewards/reward_fn/std": 0.13125301897525787, "step": 458 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 220.0, "completions/max_terminated_length": 220.0, "completions/mean_length": 178.5625, "completions/mean_terminated_length": 178.5625, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "epoch": 0.04868993317067996, "frac_reward_zero_std": 1.0, "grad_norm": 0.1240234375, "kl": 0.0506303379079327, "learning_rate": 7.8168e-06, "loss": 0.002, "num_tokens": 20332694.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 459 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 244.0, "completions/max_terminated_length": 244.0, "completions/mean_length": 189.59375, "completions/mean_terminated_length": 189.59375, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "epoch": 0.04879601145645486, "frac_reward_zero_std": 1.0, "grad_norm": 0.08544921875, "kl": 0.037065216922201216, "learning_rate": 7.8164e-06, "loss": 0.0015, "num_tokens": 20377769.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 460 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 363.0, "completions/max_terminated_length": 363.0, "completions/mean_length": 274.0625, "completions/mean_terminated_length": 274.0625, "completions/min_length": 209.0, "completions/min_terminated_length": 209.0, "epoch": 0.048902089742229767, "frac_reward_zero_std": 1.0, "grad_norm": 0.07421875, "kl": 0.03126209176843986, "learning_rate": 7.816e-06, "loss": 0.0013, "num_tokens": 20428491.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 461 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 427.0, "completions/max_terminated_length": 427.0, "completions/mean_length": 248.71875, "completions/mean_terminated_length": 248.71875, "completions/min_length": 175.0, "completions/min_terminated_length": 175.0, "epoch": 0.049008168028004666, "frac_reward_zero_std": 0.0, "grad_norm": 2.078125, "kl": 0.05388626334024593, "learning_rate": 7.8156e-06, "loss": 0.0022, "num_tokens": 20460482.0, "reward": 2.950096607208252, "reward_std": 0.07136090844869614, "rewards/reward_fn/mean": 2.950096607208252, "rewards/reward_fn/std": 0.07136087864637375, "step": 462 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 455.0, "completions/max_terminated_length": 455.0, "completions/mean_length": 348.0625, "completions/mean_terminated_length": 348.0625, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "epoch": 0.04911424631377957, "frac_reward_zero_std": 0.0, "grad_norm": 1.1875, "kl": 0.04740521451458335, "learning_rate": 7.8152e-06, "loss": 0.0019, "num_tokens": 20493988.0, "reward": 2.8588528633117676, "reward_std": 0.027407808229327202, "rewards/reward_fn/mean": 2.8588528633117676, "rewards/reward_fn/std": 0.027407843619585037, "step": 463 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 629.0, "completions/max_terminated_length": 629.0, "completions/mean_length": 387.84375, "completions/mean_terminated_length": 387.84375, "completions/min_length": 259.0, "completions/min_terminated_length": 259.0, "epoch": 0.04922032459955447, "frac_reward_zero_std": 0.0, "grad_norm": 1.34375, "kl": 0.048984733468387276, "learning_rate": 7.8148e-06, "loss": 0.002, "num_tokens": 20542143.0, "reward": 2.9211349487304688, "reward_std": 0.02732696197926998, "rewards/reward_fn/mean": 2.9211349487304688, "rewards/reward_fn/std": 0.02732696942985058, "step": 464 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1078.0, "completions/max_terminated_length": 1078.0, "completions/mean_length": 484.40625, "completions/mean_terminated_length": 484.40625, "completions/min_length": 328.0, "completions/min_terminated_length": 328.0, "epoch": 0.04932640288532937, "frac_reward_zero_std": 1.0, "grad_norm": 0.06689453125, "kl": 0.04254200303694233, "learning_rate": 7.8144e-06, "loss": 0.0017, "num_tokens": 20600140.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 465 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 261.0, "completions/max_terminated_length": 261.0, "completions/mean_length": 235.9375, "completions/mean_terminated_length": 235.9375, "completions/min_length": 212.0, "completions/min_terminated_length": 212.0, "epoch": 0.04943248117110428, "frac_reward_zero_std": 0.0, "grad_norm": 1.3515625, "kl": 0.04190182563615963, "learning_rate": 7.814e-06, "loss": 0.0017, "num_tokens": 20640522.0, "reward": 1.7360775470733643, "reward_std": 0.008762822486460209, "rewards/reward_fn/mean": 1.7360775470733643, "rewards/reward_fn/std": 0.008762827143073082, "step": 466 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 89.0, "completions/max_terminated_length": 89.0, "completions/mean_length": 77.65625, "completions/mean_terminated_length": 77.65625, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "epoch": 0.049538559456879176, "frac_reward_zero_std": 1.0, "grad_norm": 0.12353515625, "kl": 0.017409066756954417, "learning_rate": 7.8136e-06, "loss": 0.0007, "num_tokens": 20680607.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 467 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 635.0, "completions/max_terminated_length": 635.0, "completions/mean_length": 457.625, "completions/mean_terminated_length": 457.625, "completions/min_length": 323.0, "completions/min_terminated_length": 323.0, "epoch": 0.049644637742654076, "frac_reward_zero_std": 0.0, "grad_norm": 1.2109375, "kl": 0.05563335045007989, "learning_rate": 7.8132e-06, "loss": 0.0022, "num_tokens": 20725331.0, "reward": 3.640636920928955, "reward_std": 0.6801393032073975, "rewards/reward_fn/mean": 3.640636920928955, "rewards/reward_fn/std": 0.6801392436027527, "step": 468 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 230.0, "completions/max_terminated_length": 230.0, "completions/mean_length": 187.3125, "completions/mean_terminated_length": 187.3125, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "epoch": 0.04975071602842898, "frac_reward_zero_std": 0.0, "grad_norm": 2.078125, "kl": 0.04255328554427251, "learning_rate": 7.812799999999999e-06, "loss": 0.0017, "num_tokens": 20765597.0, "reward": 3.0404469966888428, "reward_std": 0.4203813970088959, "rewards/reward_fn/mean": 3.0404469966888428, "rewards/reward_fn/std": 0.4203813970088959, "step": 469 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 187.0, "completions/max_terminated_length": 187.0, "completions/mean_length": 157.09375, "completions/mean_terminated_length": 157.09375, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "epoch": 0.04985679431420388, "frac_reward_zero_std": 1.0, "grad_norm": 0.12353515625, "kl": 0.0492172134690918, "learning_rate": 7.8124e-06, "loss": 0.002, "num_tokens": 20803616.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 470 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 393.0, "completions/max_terminated_length": 393.0, "completions/mean_length": 302.71875, "completions/mean_terminated_length": 302.71875, "completions/min_length": 281.0, "completions/min_terminated_length": 281.0, "epoch": 0.04996287259997879, "frac_reward_zero_std": 0.0, "grad_norm": 1.0625, "kl": 0.0402672459022142, "learning_rate": 7.812e-06, "loss": 0.0016, "num_tokens": 20853207.0, "reward": 2.911402702331543, "reward_std": 0.01763262040913105, "rewards/reward_fn/mean": 2.911402702331543, "rewards/reward_fn/std": 0.017632605507969856, "step": 471 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 227.0, "completions/max_terminated_length": 227.0, "completions/mean_length": 169.75, "completions/mean_terminated_length": 169.75, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "epoch": 0.05006895088575369, "frac_reward_zero_std": 0.0, "grad_norm": 1.46875, "kl": 0.05370669817784801, "learning_rate": 7.8116e-06, "loss": 0.0021, "num_tokens": 20906767.0, "reward": 3.9612836837768555, "reward_std": 0.2190132439136505, "rewards/reward_fn/mean": 3.9612836837768555, "rewards/reward_fn/std": 0.21901322901248932, "step": 472 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 253.0, "completions/max_terminated_length": 253.0, "completions/mean_length": 170.90625, "completions/mean_terminated_length": 170.90625, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "epoch": 0.050175029171528586, "frac_reward_zero_std": 0.0, "grad_norm": 2.015625, "kl": 0.04981033824151382, "learning_rate": 7.8112e-06, "loss": 0.002, "num_tokens": 20952940.0, "reward": 3.745567798614502, "reward_std": 0.4885614812374115, "rewards/reward_fn/mean": 3.745567798614502, "rewards/reward_fn/std": 0.4885614812374115, "step": 473 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 201.0, "completions/max_terminated_length": 201.0, "completions/mean_length": 149.15625, "completions/mean_terminated_length": 149.15625, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "epoch": 0.05028110745730349, "frac_reward_zero_std": 1.0, "grad_norm": 0.1376953125, "kl": 0.031192386173643172, "learning_rate": 7.8108e-06, "loss": 0.0012, "num_tokens": 20996337.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 474 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 104.0, "completions/max_terminated_length": 104.0, "completions/mean_length": 93.3125, "completions/mean_terminated_length": 93.3125, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "epoch": 0.05038718574307839, "frac_reward_zero_std": 1.0, "grad_norm": 0.11083984375, "kl": 0.02836272062268108, "learning_rate": 7.810399999999999e-06, "loss": 0.0011, "num_tokens": 21020795.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 475 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 272.0, "completions/max_terminated_length": 272.0, "completions/mean_length": 220.6875, "completions/mean_terminated_length": 220.6875, "completions/min_length": 186.0, "completions/min_terminated_length": 186.0, "epoch": 0.05049326402885329, "frac_reward_zero_std": 0.0, "grad_norm": 1.25, "kl": 0.048613334831316024, "learning_rate": 7.81e-06, "loss": 0.002, "num_tokens": 21054545.0, "reward": 3.040393829345703, "reward_std": 0.01075148768723011, "rewards/reward_fn/mean": 3.040393829345703, "rewards/reward_fn/std": 0.010751496069133282, "step": 476 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 214.0, "completions/max_terminated_length": 214.0, "completions/mean_length": 158.375, "completions/mean_terminated_length": 158.375, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "epoch": 0.0505993423146282, "frac_reward_zero_std": 0.0, "grad_norm": 2.09375, "kl": 0.05732053401879966, "learning_rate": 7.809599999999999e-06, "loss": 0.0023, "num_tokens": 21105821.0, "reward": 2.80159592628479, "reward_std": 0.06686852127313614, "rewards/reward_fn/mean": 2.80159592628479, "rewards/reward_fn/std": 0.06686852127313614, "step": 477 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 173.0, "completions/max_terminated_length": 173.0, "completions/mean_length": 117.25, "completions/mean_terminated_length": 117.25, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "epoch": 0.050705420600403096, "frac_reward_zero_std": 1.0, "grad_norm": 0.13671875, "kl": 0.030846741399727762, "learning_rate": 7.8092e-06, "loss": 0.0012, "num_tokens": 21144581.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 478 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 488.0, "completions/max_terminated_length": 488.0, "completions/mean_length": 317.375, "completions/mean_terminated_length": 317.375, "completions/min_length": 248.0, "completions/min_terminated_length": 248.0, "epoch": 0.050811498886178, "frac_reward_zero_std": 0.0, "grad_norm": 1.234375, "kl": 0.03433474572375417, "learning_rate": 7.808799999999999e-06, "loss": 0.0014, "num_tokens": 21189425.0, "reward": 3.131831169128418, "reward_std": 0.16248132288455963, "rewards/reward_fn/mean": 3.131831169128418, "rewards/reward_fn/std": 0.16248130798339844, "step": 479 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 122.0, "completions/max_terminated_length": 122.0, "completions/mean_length": 80.34375, "completions/mean_terminated_length": 80.34375, "completions/min_length": 70.0, "completions/min_terminated_length": 70.0, "epoch": 0.0509175771719529, "frac_reward_zero_std": 0.0, "grad_norm": 3.1875, "kl": 0.03051563521148637, "learning_rate": 7.8084e-06, "loss": 0.0012, "num_tokens": 21236604.0, "reward": 3.8580493927001953, "reward_std": 0.30022406578063965, "rewards/reward_fn/mean": 3.8580493927001953, "rewards/reward_fn/std": 0.30022403597831726, "step": 480 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 182.0, "completions/max_terminated_length": 182.0, "completions/mean_length": 125.5, "completions/mean_terminated_length": 125.5, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "epoch": 0.0510236554577278, "frac_reward_zero_std": 1.0, "grad_norm": 0.1298828125, "kl": 0.04617619724012911, "learning_rate": 7.807999999999999e-06, "loss": 0.0018, "num_tokens": 21271788.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 481 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 178.0, "completions/max_terminated_length": 178.0, "completions/mean_length": 157.9375, "completions/mean_terminated_length": 157.9375, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "epoch": 0.05112973374350271, "frac_reward_zero_std": 0.0, "grad_norm": 2.53125, "kl": 0.06474354478996247, "learning_rate": 7.8076e-06, "loss": 0.0026, "num_tokens": 21310826.0, "reward": 3.430511474609375, "reward_std": 0.5437823534011841, "rewards/reward_fn/mean": 3.430511474609375, "rewards/reward_fn/std": 0.5437823534011841, "step": 482 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 252.0, "completions/max_terminated_length": 252.0, "completions/mean_length": 218.75, "completions/mean_terminated_length": 218.75, "completions/min_length": 183.0, "completions/min_terminated_length": 183.0, "epoch": 0.05123581202927761, "frac_reward_zero_std": 0.0, "grad_norm": 1.3359375, "kl": 0.036409151420230046, "learning_rate": 7.8072e-06, "loss": 0.0015, "num_tokens": 21356002.0, "reward": 1.8208963871002197, "reward_std": 0.017528312280774117, "rewards/reward_fn/mean": 1.8208963871002197, "rewards/reward_fn/std": 0.01752830669283867, "step": 483 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 209.4375, "completions/mean_terminated_length": 209.4375, "completions/min_length": 183.0, "completions/min_terminated_length": 183.0, "epoch": 0.051341890315052506, "frac_reward_zero_std": 0.0, "grad_norm": 1.3828125, "kl": 0.03509134179330431, "learning_rate": 7.8068e-06, "loss": 0.0014, "num_tokens": 21402096.0, "reward": 3.185612916946411, "reward_std": 0.013976151123642921, "rewards/reward_fn/mean": 3.185612916946411, "rewards/reward_fn/std": 0.0139761408790946, "step": 484 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 253.0, "completions/max_terminated_length": 253.0, "completions/mean_length": 182.59375, "completions/mean_terminated_length": 182.59375, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "epoch": 0.05144796860082741, "frac_reward_zero_std": 0.0, "grad_norm": 2.09375, "kl": 0.048504149657674134, "learning_rate": 7.8064e-06, "loss": 0.0019, "num_tokens": 21441379.0, "reward": 2.9261727333068848, "reward_std": 0.19896990060806274, "rewards/reward_fn/mean": 2.9261727333068848, "rewards/reward_fn/std": 0.19896981120109558, "step": 485 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 375.0, "completions/max_terminated_length": 375.0, "completions/mean_length": 251.625, "completions/mean_terminated_length": 251.625, "completions/min_length": 195.0, "completions/min_terminated_length": 195.0, "epoch": 0.05155404688660231, "frac_reward_zero_std": 0.0, "grad_norm": 1.234375, "kl": 0.04867141193244606, "learning_rate": 7.806e-06, "loss": 0.0019, "num_tokens": 21483063.0, "reward": 3.803684949874878, "reward_std": 0.8027096390724182, "rewards/reward_fn/mean": 3.803684949874878, "rewards/reward_fn/std": 0.802709698677063, "step": 486 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 186.0, "completions/max_terminated_length": 186.0, "completions/mean_length": 131.5625, "completions/mean_terminated_length": 131.5625, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "epoch": 0.05166012517237722, "frac_reward_zero_std": 0.0, "grad_norm": 2.8125, "kl": 0.05726821324788034, "learning_rate": 7.8056e-06, "loss": 0.0023, "num_tokens": 21521001.0, "reward": 3.840595245361328, "reward_std": 0.3371758460998535, "rewards/reward_fn/mean": 3.840595245361328, "rewards/reward_fn/std": 0.3371758460998535, "step": 487 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 839.0, "completions/max_terminated_length": 839.0, "completions/mean_length": 406.90625, "completions/mean_terminated_length": 406.90625, "completions/min_length": 278.0, "completions/min_terminated_length": 278.0, "epoch": 0.05176620345815212, "frac_reward_zero_std": 0.0, "grad_norm": 1.2578125, "kl": 0.043418677465524524, "learning_rate": 7.8052e-06, "loss": 0.0017, "num_tokens": 21585126.0, "reward": 3.6899161338806152, "reward_std": 0.5464287996292114, "rewards/reward_fn/mean": 3.6899161338806152, "rewards/reward_fn/std": 0.5464287996292114, "step": 488 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 733.0, "completions/max_terminated_length": 733.0, "completions/mean_length": 450.8125, "completions/mean_terminated_length": 450.8125, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "epoch": 0.051872281743927016, "frac_reward_zero_std": 0.0, "grad_norm": 1.2421875, "kl": 0.05789197108242661, "learning_rate": 7.8048e-06, "loss": 0.0023, "num_tokens": 21633888.0, "reward": 2.7008860111236572, "reward_std": 0.2668880224227905, "rewards/reward_fn/mean": 2.7008860111236572, "rewards/reward_fn/std": 0.2668880224227905, "step": 489 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 574.0, "completions/max_terminated_length": 574.0, "completions/mean_length": 379.5, "completions/mean_terminated_length": 379.5, "completions/min_length": 290.0, "completions/min_terminated_length": 290.0, "epoch": 0.05197836002970192, "frac_reward_zero_std": 0.0, "grad_norm": 1.1328125, "kl": 0.04075531323906034, "learning_rate": 7.8044e-06, "loss": 0.0016, "num_tokens": 21696080.0, "reward": 2.7861926555633545, "reward_std": 0.3229799270629883, "rewards/reward_fn/mean": 2.7861926555633545, "rewards/reward_fn/std": 0.3229798674583435, "step": 490 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 541.0, "completions/max_terminated_length": 541.0, "completions/mean_length": 364.46875, "completions/mean_terminated_length": 364.46875, "completions/min_length": 274.0, "completions/min_terminated_length": 274.0, "epoch": 0.05208443831547682, "frac_reward_zero_std": 0.0, "grad_norm": 1.3359375, "kl": 0.044861781294457614, "learning_rate": 7.804e-06, "loss": 0.0018, "num_tokens": 21749247.0, "reward": 2.9800658226013184, "reward_std": 0.3406826853752136, "rewards/reward_fn/mean": 2.9800658226013184, "rewards/reward_fn/std": 0.3406826853752136, "step": 491 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 227.0, "completions/max_terminated_length": 227.0, "completions/mean_length": 166.28125, "completions/mean_terminated_length": 166.28125, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "epoch": 0.05219051660125172, "frac_reward_zero_std": 0.0, "grad_norm": 1.8984375, "kl": 0.05009030376095325, "learning_rate": 7.8036e-06, "loss": 0.002, "num_tokens": 21788488.0, "reward": 2.9894442558288574, "reward_std": 0.013725874945521355, "rewards/reward_fn/mean": 2.9894442558288574, "rewards/reward_fn/std": 0.013725854456424713, "step": 492 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 224.0, "completions/max_terminated_length": 224.0, "completions/mean_length": 186.28125, "completions/mean_terminated_length": 186.28125, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "epoch": 0.05229659488702663, "frac_reward_zero_std": 1.0, "grad_norm": 0.1005859375, "kl": 0.04771283362060785, "learning_rate": 7.8032e-06, "loss": 0.0019, "num_tokens": 21830353.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 493 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 187.0, "completions/max_terminated_length": 187.0, "completions/mean_length": 126.03125, "completions/mean_terminated_length": 126.03125, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "epoch": 0.05240267317280153, "frac_reward_zero_std": 1.0, "grad_norm": 0.1943359375, "kl": 0.05836413930228446, "learning_rate": 7.8028e-06, "loss": 0.0023, "num_tokens": 21871282.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 494 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 346.0, "completions/max_terminated_length": 346.0, "completions/mean_length": 256.28125, "completions/mean_terminated_length": 256.28125, "completions/min_length": 208.0, "completions/min_terminated_length": 208.0, "epoch": 0.052508751458576426, "frac_reward_zero_std": 0.0, "grad_norm": 1.484375, "kl": 0.08196262107230723, "learning_rate": 7.8024e-06, "loss": 0.0033, "num_tokens": 21915227.0, "reward": 3.26692795753479, "reward_std": 0.4324604272842407, "rewards/reward_fn/mean": 3.26692795753479, "rewards/reward_fn/std": 0.4324604272842407, "step": 495 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 410.0, "completions/max_terminated_length": 410.0, "completions/mean_length": 201.96875, "completions/mean_terminated_length": 201.96875, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "epoch": 0.05261482974435133, "frac_reward_zero_std": 1.0, "grad_norm": 0.140625, "kl": 0.06773088849149644, "learning_rate": 7.802e-06, "loss": 0.0027, "num_tokens": 21968250.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 496 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 772.0, "completions/max_terminated_length": 772.0, "completions/mean_length": 460.6875, "completions/mean_terminated_length": 460.6875, "completions/min_length": 355.0, "completions/min_terminated_length": 355.0, "epoch": 0.05272090803012623, "frac_reward_zero_std": 0.0, "grad_norm": 1.09375, "kl": 0.04177248413907364, "learning_rate": 7.8016e-06, "loss": 0.0017, "num_tokens": 22026704.0, "reward": 2.7659690380096436, "reward_std": 0.25846555829048157, "rewards/reward_fn/mean": 2.7659690380096436, "rewards/reward_fn/std": 0.25846555829048157, "step": 497 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1098.0, "completions/max_terminated_length": 1098.0, "completions/mean_length": 604.03125, "completions/mean_terminated_length": 604.03125, "completions/min_length": 291.0, "completions/min_terminated_length": 291.0, "epoch": 0.05282698631590114, "frac_reward_zero_std": 0.0, "grad_norm": 1.3984375, "kl": 0.07456690678372979, "learning_rate": 7.801199999999999e-06, "loss": 0.003, "num_tokens": 22081009.0, "reward": 2.7125933170318604, "reward_std": 0.33740636706352234, "rewards/reward_fn/mean": 2.7125933170318604, "rewards/reward_fn/std": 0.3374064266681671, "step": 498 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 377.0, "completions/max_terminated_length": 377.0, "completions/mean_length": 268.78125, "completions/mean_terminated_length": 268.78125, "completions/min_length": 240.0, "completions/min_terminated_length": 240.0, "epoch": 0.05293306460167604, "frac_reward_zero_std": 1.0, "grad_norm": 0.05908203125, "kl": 0.036616402619984, "learning_rate": 7.8008e-06, "loss": 0.0015, "num_tokens": 22115626.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 499 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 469.0, "completions/max_terminated_length": 469.0, "completions/mean_length": 254.90625, "completions/mean_terminated_length": 254.90625, "completions/min_length": 193.0, "completions/min_terminated_length": 193.0, "epoch": 0.05303914288745094, "frac_reward_zero_std": 0.0, "grad_norm": 1.1875, "kl": 0.05103367427363992, "learning_rate": 7.800399999999999e-06, "loss": 0.002, "num_tokens": 22166055.0, "reward": 3.9605257511138916, "reward_std": 0.22330042719841003, "rewards/reward_fn/mean": 3.9605257511138916, "rewards/reward_fn/std": 0.22330045700073242, "step": 500 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 70.0, "completions/max_terminated_length": 70.0, "completions/mean_length": 57.78125, "completions/mean_terminated_length": 57.78125, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "epoch": 0.05314522117322584, "frac_reward_zero_std": 1.0, "grad_norm": 0.177734375, "kl": 0.020510083792032674, "learning_rate": 7.8e-06, "loss": 0.0008, "num_tokens": 22186080.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 501 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 518.0, "completions/max_terminated_length": 518.0, "completions/mean_length": 348.09375, "completions/mean_terminated_length": 348.09375, "completions/min_length": 240.0, "completions/min_terminated_length": 240.0, "epoch": 0.05325129945900074, "frac_reward_zero_std": 0.0, "grad_norm": 1.1484375, "kl": 0.06456627510488033, "learning_rate": 7.799599999999999e-06, "loss": 0.0026, "num_tokens": 22238659.0, "reward": 3.9281535148620605, "reward_std": 0.2827524244785309, "rewards/reward_fn/mean": 3.9281535148620605, "rewards/reward_fn/std": 0.2827524244785309, "step": 502 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 224.0, "completions/max_terminated_length": 224.0, "completions/mean_length": 160.71875, "completions/mean_terminated_length": 160.71875, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "epoch": 0.05335737774477564, "frac_reward_zero_std": 0.0, "grad_norm": 1.7734375, "kl": 0.07040900446008891, "learning_rate": 7.7992e-06, "loss": 0.0028, "num_tokens": 22278650.0, "reward": 3.0478758811950684, "reward_std": 0.008108945563435555, "rewards/reward_fn/mean": 3.0478758811950684, "rewards/reward_fn/std": 0.008108980022370815, "step": 503 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 372.0, "completions/max_terminated_length": 372.0, "completions/mean_length": 233.65625, "completions/mean_terminated_length": 233.65625, "completions/min_length": 170.0, "completions/min_terminated_length": 170.0, "epoch": 0.05346345603055055, "frac_reward_zero_std": 0.0, "grad_norm": 1.5390625, "kl": 0.06506969581823796, "learning_rate": 7.798799999999999e-06, "loss": 0.0026, "num_tokens": 22320079.0, "reward": 3.0991227626800537, "reward_std": 0.24598754942417145, "rewards/reward_fn/mean": 3.0991227626800537, "rewards/reward_fn/std": 0.24598753452301025, "step": 504 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 801.0, "completions/max_terminated_length": 801.0, "completions/mean_length": 415.71875, "completions/mean_terminated_length": 415.71875, "completions/min_length": 296.0, "completions/min_terminated_length": 296.0, "epoch": 0.05356953431632545, "frac_reward_zero_std": 0.0, "grad_norm": 1.5546875, "kl": 0.04956794378813356, "learning_rate": 7.7984e-06, "loss": 0.002, "num_tokens": 22381862.0, "reward": 3.519956111907959, "reward_std": 0.5920132398605347, "rewards/reward_fn/mean": 3.519956111907959, "rewards/reward_fn/std": 0.5920132398605347, "step": 505 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 307.0, "completions/max_terminated_length": 307.0, "completions/mean_length": 229.46875, "completions/mean_terminated_length": 229.46875, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "epoch": 0.05367561260210035, "frac_reward_zero_std": 1.0, "grad_norm": 0.09912109375, "kl": 0.03545730945188552, "learning_rate": 7.797999999999999e-06, "loss": 0.0014, "num_tokens": 22410741.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 506 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 373.0, "completions/max_terminated_length": 373.0, "completions/mean_length": 264.4375, "completions/mean_terminated_length": 264.4375, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "epoch": 0.05378169088787525, "frac_reward_zero_std": 0.0, "grad_norm": 1.5, "kl": 0.05536438780836761, "learning_rate": 7.7976e-06, "loss": 0.0022, "num_tokens": 22459363.0, "reward": 3.8821825981140137, "reward_std": 0.3202844560146332, "rewards/reward_fn/mean": 3.8821825981140137, "rewards/reward_fn/std": 0.3202844262123108, "step": 507 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 416.0, "completions/max_terminated_length": 416.0, "completions/mean_length": 170.90625, "completions/mean_terminated_length": 170.90625, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "epoch": 0.05388776917365015, "frac_reward_zero_std": 1.0, "grad_norm": 0.1298828125, "kl": 0.05457607749849558, "learning_rate": 7.7972e-06, "loss": 0.0022, "num_tokens": 22502848.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 508 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 352.0, "completions/max_terminated_length": 352.0, "completions/mean_length": 255.75, "completions/mean_terminated_length": 255.75, "completions/min_length": 209.0, "completions/min_terminated_length": 209.0, "epoch": 0.05399384745942506, "frac_reward_zero_std": 1.0, "grad_norm": 0.10400390625, "kl": 0.04397421167232096, "learning_rate": 7.7968e-06, "loss": 0.0018, "num_tokens": 22555672.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 509 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 188.0, "completions/max_terminated_length": 188.0, "completions/mean_length": 119.90625, "completions/mean_terminated_length": 119.90625, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "epoch": 0.05409992574519996, "frac_reward_zero_std": 1.0, "grad_norm": 0.15625, "kl": 0.0700938591035083, "learning_rate": 7.7964e-06, "loss": 0.0028, "num_tokens": 22583125.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 510 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 308.0, "completions/max_terminated_length": 308.0, "completions/mean_length": 176.21875, "completions/mean_terminated_length": 176.21875, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "epoch": 0.05420600403097486, "frac_reward_zero_std": 0.0, "grad_norm": 2.078125, "kl": 0.06420622754376382, "learning_rate": 7.796e-06, "loss": 0.0026, "num_tokens": 22631228.0, "reward": 2.773768424987793, "reward_std": 0.05320408195257187, "rewards/reward_fn/mean": 2.773768424987793, "rewards/reward_fn/std": 0.053204067051410675, "step": 511 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 661.0, "completions/max_terminated_length": 661.0, "completions/mean_length": 243.6875, "completions/mean_terminated_length": 243.6875, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "epoch": 0.05431208231674976, "frac_reward_zero_std": 1.0, "grad_norm": 0.11767578125, "kl": 0.05076413287315518, "learning_rate": 7.7956e-06, "loss": 0.002, "num_tokens": 22689362.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 512 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 222.0, "completions/max_terminated_length": 222.0, "completions/mean_length": 187.03125, "completions/mean_terminated_length": 187.03125, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "epoch": 0.05441816060252466, "frac_reward_zero_std": 0.0, "grad_norm": 1.578125, "kl": 0.04727089777588844, "learning_rate": 7.7952e-06, "loss": 0.0019, "num_tokens": 22728115.0, "reward": 2.8927512168884277, "reward_std": 0.29177990555763245, "rewards/reward_fn/mean": 2.8927512168884277, "rewards/reward_fn/std": 0.29177987575531006, "step": 513 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 160.0, "completions/max_terminated_length": 160.0, "completions/mean_length": 131.03125, "completions/mean_terminated_length": 131.03125, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "epoch": 0.05452423888829957, "frac_reward_zero_std": 1.0, "grad_norm": 0.16015625, "kl": 0.06265121378237382, "learning_rate": 7.7948e-06, "loss": 0.0025, "num_tokens": 22761108.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 514 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 109.0, "completions/max_terminated_length": 109.0, "completions/mean_length": 106.3125, "completions/mean_terminated_length": 106.3125, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "epoch": 0.05463031717407447, "frac_reward_zero_std": 1.0, "grad_norm": 0.0634765625, "kl": 0.010751996480394155, "learning_rate": 7.7944e-06, "loss": 0.0004, "num_tokens": 22803678.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 515 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1898.0, "completions/mean_length": 581.75, "completions/mean_terminated_length": 534.4515991210938, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "epoch": 0.05473639545984937, "frac_reward_zero_std": 0.0, "grad_norm": 1.2890625, "kl": 0.057617883081547916, "learning_rate": 7.793999999999999e-06, "loss": 0.0023, "num_tokens": 22881142.0, "reward": 3.3143372535705566, "reward_std": 1.1114939451217651, "rewards/reward_fn/mean": 3.3143372535705566, "rewards/reward_fn/std": 1.1114939451217651, "step": 516 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 478.0, "completions/max_terminated_length": 478.0, "completions/mean_length": 250.5625, "completions/mean_terminated_length": 250.5625, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "epoch": 0.05484247374562427, "frac_reward_zero_std": 0.0, "grad_norm": 1.609375, "kl": 0.05981055251322687, "learning_rate": 7.7936e-06, "loss": 0.0024, "num_tokens": 22919592.0, "reward": 2.810237407684326, "reward_std": 0.2666741907596588, "rewards/reward_fn/mean": 2.810237407684326, "rewards/reward_fn/std": 0.2666742503643036, "step": 517 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 91.0, "completions/max_terminated_length": 91.0, "completions/mean_length": 82.75, "completions/mean_terminated_length": 82.75, "completions/min_length": 78.0, "completions/min_terminated_length": 78.0, "epoch": 0.05494855203139917, "frac_reward_zero_std": 1.0, "grad_norm": 0.1357421875, "kl": 0.014161759259877726, "learning_rate": 7.793199999999999e-06, "loss": 0.0006, "num_tokens": 22957280.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 518 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 176.0, "completions/max_terminated_length": 176.0, "completions/mean_length": 151.59375, "completions/mean_terminated_length": 151.59375, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "epoch": 0.05505463031717407, "frac_reward_zero_std": 1.0, "grad_norm": 0.1123046875, "kl": 0.03626142651773989, "learning_rate": 7.7928e-06, "loss": 0.0015, "num_tokens": 22993619.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 519 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 392.0, "completions/max_terminated_length": 392.0, "completions/mean_length": 142.53125, "completions/mean_terminated_length": 142.53125, "completions/min_length": 66.0, "completions/min_terminated_length": 66.0, "epoch": 0.05516070860294898, "frac_reward_zero_std": 1.0, "grad_norm": 0.12158203125, "kl": 0.03131835992098786, "learning_rate": 7.7924e-06, "loss": 0.0013, "num_tokens": 23031588.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 520 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 257.0, "completions/max_terminated_length": 257.0, "completions/mean_length": 221.3125, "completions/mean_terminated_length": 221.3125, "completions/min_length": 196.0, "completions/min_terminated_length": 196.0, "epoch": 0.05526678688872388, "frac_reward_zero_std": 1.0, "grad_norm": 0.1083984375, "kl": 0.05314846872352064, "learning_rate": 7.792e-06, "loss": 0.0021, "num_tokens": 23075534.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 521 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 689.0, "completions/max_terminated_length": 689.0, "completions/mean_length": 186.15625, "completions/mean_terminated_length": 186.15625, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "epoch": 0.05537286517449878, "frac_reward_zero_std": 1.0, "grad_norm": 0.1240234375, "kl": 0.06087932689115405, "learning_rate": 7.7916e-06, "loss": 0.0024, "num_tokens": 23131827.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 522 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1519.0, "completions/mean_length": 508.03125, "completions/mean_terminated_length": 458.3548278808594, "completions/min_length": 210.0, "completions/min_terminated_length": 210.0, "epoch": 0.05547894346027368, "frac_reward_zero_std": 0.0, "grad_norm": 1.0625, "kl": 0.045374272391200066, "learning_rate": 7.7912e-06, "loss": 0.0018, "num_tokens": 23196692.0, "reward": 2.6854634284973145, "reward_std": 0.5392786860466003, "rewards/reward_fn/mean": 2.6854634284973145, "rewards/reward_fn/std": 0.5392786264419556, "step": 523 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 247.0, "completions/max_terminated_length": 247.0, "completions/mean_length": 203.15625, "completions/mean_terminated_length": 203.15625, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "epoch": 0.05558502174604858, "frac_reward_zero_std": 1.0, "grad_norm": 0.11181640625, "kl": 0.05111256393138319, "learning_rate": 7.7908e-06, "loss": 0.002, "num_tokens": 23218713.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 524 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 205.0, "completions/max_terminated_length": 205.0, "completions/mean_length": 112.96875, "completions/mean_terminated_length": 112.96875, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "epoch": 0.05569110003182349, "frac_reward_zero_std": 0.0, "grad_norm": 2.3125, "kl": 0.05130923929391429, "learning_rate": 7.790399999999999e-06, "loss": 0.0021, "num_tokens": 23259032.0, "reward": 3.9281158447265625, "reward_std": 0.4066387414932251, "rewards/reward_fn/mean": 3.9281158447265625, "rewards/reward_fn/std": 0.4066386818885803, "step": 525 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 226.0, "completions/max_terminated_length": 226.0, "completions/mean_length": 173.28125, "completions/mean_terminated_length": 173.28125, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "epoch": 0.05579717831759839, "frac_reward_zero_std": 0.0, "grad_norm": 1.9140625, "kl": 0.0680079300655052, "learning_rate": 7.79e-06, "loss": 0.0027, "num_tokens": 23297473.0, "reward": 2.882020950317383, "reward_std": 0.01828574575483799, "rewards/reward_fn/mean": 2.882020950317383, "rewards/reward_fn/std": 0.01828572154045105, "step": 526 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1657.0, "completions/max_terminated_length": 1657.0, "completions/mean_length": 455.5, "completions/mean_terminated_length": 455.5, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "epoch": 0.05590325660337329, "frac_reward_zero_std": 1.0, "grad_norm": 0.1171875, "kl": 0.06674273777753115, "learning_rate": 7.789599999999999e-06, "loss": 0.0027, "num_tokens": 23376529.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 527 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 335.0, "completions/max_terminated_length": 335.0, "completions/mean_length": 207.375, "completions/mean_terminated_length": 207.375, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "epoch": 0.056009334889148193, "frac_reward_zero_std": 1.0, "grad_norm": 0.11376953125, "kl": 0.04751398117514327, "learning_rate": 7.7892e-06, "loss": 0.0019, "num_tokens": 23429757.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 528 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 581.0, "completions/max_terminated_length": 581.0, "completions/mean_length": 369.78125, "completions/mean_terminated_length": 369.78125, "completions/min_length": 267.0, "completions/min_terminated_length": 267.0, "epoch": 0.05611541317492309, "frac_reward_zero_std": 0.0, "grad_norm": 1.609375, "kl": 0.05224546114914119, "learning_rate": 7.788799999999999e-06, "loss": 0.0021, "num_tokens": 23480214.0, "reward": 2.8701210021972656, "reward_std": 0.0751161128282547, "rewards/reward_fn/mean": 2.8701210021972656, "rewards/reward_fn/std": 0.0751161202788353, "step": 529 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 374.0, "completions/max_terminated_length": 374.0, "completions/mean_length": 263.3125, "completions/mean_terminated_length": 263.3125, "completions/min_length": 227.0, "completions/min_terminated_length": 227.0, "epoch": 0.05622149146069799, "frac_reward_zero_std": 0.0, "grad_norm": 1.1640625, "kl": 0.03188524296274409, "learning_rate": 7.7884e-06, "loss": 0.0013, "num_tokens": 23523424.0, "reward": 2.912999153137207, "reward_std": 0.01211194321513176, "rewards/reward_fn/mean": 2.912999153137207, "rewards/reward_fn/std": 0.012111921794712543, "step": 530 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1049.0, "completions/max_terminated_length": 1049.0, "completions/mean_length": 343.875, "completions/mean_terminated_length": 343.875, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "epoch": 0.0563275697464729, "frac_reward_zero_std": 1.0, "grad_norm": 0.08935546875, "kl": 0.06691422103904188, "learning_rate": 7.788e-06, "loss": 0.0027, "num_tokens": 23583132.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 531 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 920.0, "completions/max_terminated_length": 920.0, "completions/mean_length": 264.15625, "completions/mean_terminated_length": 264.15625, "completions/min_length": 179.0, "completions/min_terminated_length": 179.0, "epoch": 0.0564336480322478, "frac_reward_zero_std": 1.0, "grad_norm": 0.10302734375, "kl": 0.041861614503432065, "learning_rate": 7.7876e-06, "loss": 0.0017, "num_tokens": 23625217.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 532 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 840.0, "completions/max_terminated_length": 840.0, "completions/mean_length": 290.375, "completions/mean_terminated_length": 290.375, "completions/min_length": 170.0, "completions/min_terminated_length": 170.0, "epoch": 0.056539726318022704, "frac_reward_zero_std": 0.0, "grad_norm": 1.9375, "kl": 0.045831574767362326, "learning_rate": 7.7872e-06, "loss": 0.0018, "num_tokens": 23666861.0, "reward": 3.0297441482543945, "reward_std": 0.04320033639669418, "rewards/reward_fn/mean": 3.0297441482543945, "rewards/reward_fn/std": 0.04320032149553299, "step": 533 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 299.0, "completions/max_terminated_length": 299.0, "completions/mean_length": 250.34375, "completions/mean_terminated_length": 250.34375, "completions/min_length": 213.0, "completions/min_terminated_length": 213.0, "epoch": 0.0566458046037976, "frac_reward_zero_std": 0.0, "grad_norm": 1.6171875, "kl": 0.03298926952993497, "learning_rate": 7.7868e-06, "loss": 0.0013, "num_tokens": 23695864.0, "reward": 2.8847732543945312, "reward_std": 0.0448918342590332, "rewards/reward_fn/mean": 2.8847732543945312, "rewards/reward_fn/std": 0.0448918342590332, "step": 534 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 198.0, "completions/max_terminated_length": 198.0, "completions/mean_length": 173.5625, "completions/mean_terminated_length": 173.5625, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "epoch": 0.0567518828895725, "frac_reward_zero_std": 1.0, "grad_norm": 0.130859375, "kl": 0.04681810602778569, "learning_rate": 7.7864e-06, "loss": 0.0019, "num_tokens": 23729130.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 535 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 506.0, "completions/max_terminated_length": 506.0, "completions/mean_length": 305.8125, "completions/mean_terminated_length": 305.8125, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "epoch": 0.05685796117534741, "frac_reward_zero_std": 0.0, "grad_norm": 1.0546875, "kl": 0.0402856832370162, "learning_rate": 7.786e-06, "loss": 0.0016, "num_tokens": 23804964.0, "reward": 2.9268569946289062, "reward_std": 0.19753608107566833, "rewards/reward_fn/mean": 2.9268569946289062, "rewards/reward_fn/std": 0.19753602147102356, "step": 536 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 548.0, "completions/max_terminated_length": 548.0, "completions/mean_length": 260.5, "completions/mean_terminated_length": 260.5, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "epoch": 0.05696403946112231, "frac_reward_zero_std": 0.0, "grad_norm": 2.53125, "kl": 0.0687692086212337, "learning_rate": 7.785599999999999e-06, "loss": 0.0028, "num_tokens": 23861652.0, "reward": 3.521317958831787, "reward_std": 0.48663264513015747, "rewards/reward_fn/mean": 3.521317958831787, "rewards/reward_fn/std": 0.48663264513015747, "step": 537 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 347.0, "completions/max_terminated_length": 347.0, "completions/mean_length": 205.65625, "completions/mean_terminated_length": 205.65625, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "epoch": 0.05707011774689721, "frac_reward_zero_std": 1.0, "grad_norm": 0.10986328125, "kl": 0.03987734788097441, "learning_rate": 7.7852e-06, "loss": 0.0016, "num_tokens": 23889033.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 538 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 272.0, "completions/max_terminated_length": 272.0, "completions/mean_length": 211.125, "completions/mean_terminated_length": 211.125, "completions/min_length": 174.0, "completions/min_terminated_length": 174.0, "epoch": 0.057176196032672114, "frac_reward_zero_std": 0.0, "grad_norm": 1.59375, "kl": 0.04420925548765808, "learning_rate": 7.784799999999999e-06, "loss": 0.0018, "num_tokens": 23942509.0, "reward": 3.1174285411834717, "reward_std": 0.07023604959249496, "rewards/reward_fn/mean": 3.1174285411834717, "rewards/reward_fn/std": 0.07023605704307556, "step": 539 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 560.0, "completions/max_terminated_length": 560.0, "completions/mean_length": 396.53125, "completions/mean_terminated_length": 396.53125, "completions/min_length": 286.0, "completions/min_terminated_length": 286.0, "epoch": 0.05728227431844701, "frac_reward_zero_std": 0.0, "grad_norm": 0.9921875, "kl": 0.030567975947633386, "learning_rate": 7.7844e-06, "loss": 0.0012, "num_tokens": 23992286.0, "reward": 3.864678382873535, "reward_std": 0.4610091745853424, "rewards/reward_fn/mean": 3.864678382873535, "rewards/reward_fn/std": 0.46100914478302, "step": 540 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 450.0, "completions/max_terminated_length": 450.0, "completions/mean_length": 293.46875, "completions/mean_terminated_length": 293.46875, "completions/min_length": 221.0, "completions/min_terminated_length": 221.0, "epoch": 0.05738835260422192, "frac_reward_zero_std": 0.0, "grad_norm": 1.421875, "kl": 0.038554588274564594, "learning_rate": 7.783999999999999e-06, "loss": 0.0015, "num_tokens": 24036237.0, "reward": 3.5523862838745117, "reward_std": 0.5504615902900696, "rewards/reward_fn/mean": 3.5523862838745117, "rewards/reward_fn/std": 0.5504615902900696, "step": 541 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 344.0, "completions/max_terminated_length": 344.0, "completions/mean_length": 262.75, "completions/mean_terminated_length": 262.75, "completions/min_length": 229.0, "completions/min_terminated_length": 229.0, "epoch": 0.05749443088999682, "frac_reward_zero_std": 1.0, "grad_norm": 0.10400390625, "kl": 0.050088691408745944, "learning_rate": 7.7836e-06, "loss": 0.002, "num_tokens": 24083557.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 542 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 789.0, "completions/max_terminated_length": 789.0, "completions/mean_length": 428.96875, "completions/mean_terminated_length": 428.96875, "completions/min_length": 305.0, "completions/min_terminated_length": 305.0, "epoch": 0.05760050917577172, "frac_reward_zero_std": 0.0, "grad_norm": 0.8671875, "kl": 0.03457546600839123, "learning_rate": 7.7832e-06, "loss": 0.0014, "num_tokens": 24142052.0, "reward": 3.9279990196228027, "reward_std": 0.4072989225387573, "rewards/reward_fn/mean": 3.9279990196228027, "rewards/reward_fn/std": 0.40729889273643494, "step": 543 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 588.0, "completions/max_terminated_length": 588.0, "completions/mean_length": 417.9375, "completions/mean_terminated_length": 417.9375, "completions/min_length": 289.0, "completions/min_terminated_length": 289.0, "epoch": 0.057706587461546624, "frac_reward_zero_std": 0.0, "grad_norm": 1.3203125, "kl": 0.032870883820578456, "learning_rate": 7.7828e-06, "loss": 0.0013, "num_tokens": 24169410.0, "reward": 3.8196969032287598, "reward_std": 0.4258030652999878, "rewards/reward_fn/mean": 3.8196969032287598, "rewards/reward_fn/std": 0.4258030652999878, "step": 544 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 636.0, "completions/max_terminated_length": 636.0, "completions/mean_length": 387.71875, "completions/mean_terminated_length": 387.71875, "completions/min_length": 228.0, "completions/min_terminated_length": 228.0, "epoch": 0.05781266574732152, "frac_reward_zero_std": 0.0, "grad_norm": 1.359375, "kl": 0.039335515175480396, "learning_rate": 7.7824e-06, "loss": 0.0016, "num_tokens": 24228249.0, "reward": 2.525263786315918, "reward_std": 0.4515567123889923, "rewards/reward_fn/mean": 2.525263786315918, "rewards/reward_fn/std": 0.4515567123889923, "step": 545 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 155.0, "completions/max_terminated_length": 155.0, "completions/mean_length": 119.03125, "completions/mean_terminated_length": 119.03125, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "epoch": 0.05791874403309642, "frac_reward_zero_std": 1.0, "grad_norm": 0.2392578125, "kl": 0.053942128957714885, "learning_rate": 7.782e-06, "loss": 0.0022, "num_tokens": 24275610.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 546 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 133.0, "completions/max_terminated_length": 133.0, "completions/mean_length": 98.4375, "completions/mean_terminated_length": 98.4375, "completions/min_length": 73.0, "completions/min_terminated_length": 73.0, "epoch": 0.05802482231887133, "frac_reward_zero_std": 1.0, "grad_norm": 0.2412109375, "kl": 0.03568551503121853, "learning_rate": 7.7816e-06, "loss": 0.0014, "num_tokens": 24318056.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 547 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 604.0, "completions/max_terminated_length": 604.0, "completions/mean_length": 430.75, "completions/mean_terminated_length": 430.75, "completions/min_length": 294.0, "completions/min_terminated_length": 294.0, "epoch": 0.05813090060464623, "frac_reward_zero_std": 0.0, "grad_norm": 1.1328125, "kl": 0.054423797759227455, "learning_rate": 7.7812e-06, "loss": 0.0022, "num_tokens": 24351296.0, "reward": 3.9609341621398926, "reward_std": 0.2209891378879547, "rewards/reward_fn/mean": 3.9609341621398926, "rewards/reward_fn/std": 0.2209891378879547, "step": 548 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 326.0, "completions/max_terminated_length": 326.0, "completions/mean_length": 230.28125, "completions/mean_terminated_length": 230.28125, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "epoch": 0.05823697889042113, "frac_reward_zero_std": 1.0, "grad_norm": 0.125, "kl": 0.05803138192277402, "learning_rate": 7.7808e-06, "loss": 0.0023, "num_tokens": 24378729.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 549 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 450.0, "completions/max_terminated_length": 450.0, "completions/mean_length": 222.21875, "completions/mean_terminated_length": 222.21875, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "epoch": 0.058343057176196034, "frac_reward_zero_std": 1.0, "grad_norm": 0.09130859375, "kl": 0.034262404195033014, "learning_rate": 7.7804e-06, "loss": 0.0014, "num_tokens": 24421136.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 550 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 362.0, "completions/max_terminated_length": 362.0, "completions/mean_length": 291.8125, "completions/mean_terminated_length": 291.8125, "completions/min_length": 244.0, "completions/min_terminated_length": 244.0, "epoch": 0.05844913546197093, "frac_reward_zero_std": 1.0, "grad_norm": 0.09375, "kl": 0.054334802902303636, "learning_rate": 7.78e-06, "loss": 0.0022, "num_tokens": 24486218.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 551 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 260.0, "completions/max_terminated_length": 260.0, "completions/mean_length": 181.125, "completions/mean_terminated_length": 181.125, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "epoch": 0.05855521374774584, "frac_reward_zero_std": 0.0, "grad_norm": 1.5546875, "kl": 0.033949769916944206, "learning_rate": 7.7796e-06, "loss": 0.0014, "num_tokens": 24523054.0, "reward": 3.0280110836029053, "reward_std": 0.04787445068359375, "rewards/reward_fn/mean": 3.0280110836029053, "rewards/reward_fn/std": 0.04787447676062584, "step": 552 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 397.0, "completions/max_terminated_length": 397.0, "completions/mean_length": 317.1875, "completions/mean_terminated_length": 317.1875, "completions/min_length": 237.0, "completions/min_terminated_length": 237.0, "epoch": 0.05866129203352074, "frac_reward_zero_std": 0.0, "grad_norm": 1.3125, "kl": 0.03886931930901483, "learning_rate": 7.7792e-06, "loss": 0.0016, "num_tokens": 24572404.0, "reward": 3.0145347118377686, "reward_std": 0.04721865430474281, "rewards/reward_fn/mean": 3.0145347118377686, "rewards/reward_fn/std": 0.04721866548061371, "step": 553 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 346.0, "completions/max_terminated_length": 346.0, "completions/mean_length": 251.03125, "completions/mean_terminated_length": 251.03125, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "epoch": 0.05876737031929564, "frac_reward_zero_std": 1.0, "grad_norm": 0.08642578125, "kl": 0.044029625481925905, "learning_rate": 7.7788e-06, "loss": 0.0018, "num_tokens": 24616533.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 554 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 202.03125, "completions/mean_terminated_length": 202.03125, "completions/min_length": 170.0, "completions/min_terminated_length": 170.0, "epoch": 0.058873448605070544, "frac_reward_zero_std": 1.0, "grad_norm": 0.1279296875, "kl": 0.06479659979231656, "learning_rate": 7.7784e-06, "loss": 0.0026, "num_tokens": 24656406.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 555 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 328.0, "completions/max_terminated_length": 328.0, "completions/mean_length": 188.90625, "completions/mean_terminated_length": 188.90625, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "epoch": 0.05897952689084544, "frac_reward_zero_std": 0.0, "grad_norm": 2.359375, "kl": 0.056396145024336874, "learning_rate": 7.777999999999999e-06, "loss": 0.0023, "num_tokens": 24704339.0, "reward": 2.929412603378296, "reward_std": 0.2831609547138214, "rewards/reward_fn/mean": 2.929412603378296, "rewards/reward_fn/std": 0.283160924911499, "step": 556 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 457.0, "completions/max_terminated_length": 457.0, "completions/mean_length": 221.875, "completions/mean_terminated_length": 221.875, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "epoch": 0.05908560517662034, "frac_reward_zero_std": 0.0, "grad_norm": 2.1875, "kl": 0.04244612640468404, "learning_rate": 7.7776e-06, "loss": 0.0017, "num_tokens": 24742575.0, "reward": 3.569821834564209, "reward_std": 0.5647028088569641, "rewards/reward_fn/mean": 3.569821834564209, "rewards/reward_fn/std": 0.5647028088569641, "step": 557 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 347.0, "completions/max_terminated_length": 347.0, "completions/mean_length": 229.78125, "completions/mean_terminated_length": 229.78125, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "epoch": 0.05919168346239525, "frac_reward_zero_std": 0.0, "grad_norm": 1.7578125, "kl": 0.0631415651878342, "learning_rate": 7.777199999999999e-06, "loss": 0.0025, "num_tokens": 24788456.0, "reward": 3.678088903427124, "reward_std": 0.485332190990448, "rewards/reward_fn/mean": 3.678088903427124, "rewards/reward_fn/std": 0.4853322207927704, "step": 558 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 370.0, "completions/max_terminated_length": 370.0, "completions/mean_length": 272.0, "completions/mean_terminated_length": 272.0, "completions/min_length": 239.0, "completions/min_terminated_length": 239.0, "epoch": 0.05929776174817015, "frac_reward_zero_std": 1.0, "grad_norm": 0.07373046875, "kl": 0.03941875655436888, "learning_rate": 7.7768e-06, "loss": 0.0016, "num_tokens": 24832360.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 559 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 249.0, "completions/max_terminated_length": 249.0, "completions/mean_length": 185.96875, "completions/mean_terminated_length": 185.96875, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "epoch": 0.059403840033945055, "frac_reward_zero_std": 1.0, "grad_norm": 0.1357421875, "kl": 0.05661877314560115, "learning_rate": 7.776399999999999e-06, "loss": 0.0023, "num_tokens": 24873159.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 560 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 174.0, "completions/max_terminated_length": 174.0, "completions/mean_length": 119.0, "completions/mean_terminated_length": 119.0, "completions/min_length": 76.0, "completions/min_terminated_length": 76.0, "epoch": 0.059509918319719954, "frac_reward_zero_std": 1.0, "grad_norm": 0.189453125, "kl": 0.03810377966146916, "learning_rate": 7.776e-06, "loss": 0.0015, "num_tokens": 24916487.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 561 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 288.0, "completions/max_terminated_length": 288.0, "completions/mean_length": 174.125, "completions/mean_terminated_length": 174.125, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "epoch": 0.05961599660549485, "frac_reward_zero_std": 1.0, "grad_norm": 0.107421875, "kl": 0.05429914325941354, "learning_rate": 7.775599999999999e-06, "loss": 0.0022, "num_tokens": 24958379.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 562 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 383.0, "completions/max_terminated_length": 383.0, "completions/mean_length": 296.3125, "completions/mean_terminated_length": 296.3125, "completions/min_length": 191.0, "completions/min_terminated_length": 191.0, "epoch": 0.05972207489126976, "frac_reward_zero_std": 0.0, "grad_norm": 1.5703125, "kl": 0.04550172860035673, "learning_rate": 7.7752e-06, "loss": 0.0018, "num_tokens": 24999541.0, "reward": 3.886976957321167, "reward_std": 0.3571126461029053, "rewards/reward_fn/mean": 3.886976957321167, "rewards/reward_fn/std": 0.3571126461029053, "step": 563 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 190.0, "completions/max_terminated_length": 190.0, "completions/mean_length": 151.5, "completions/mean_terminated_length": 151.5, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "epoch": 0.05982815317704466, "frac_reward_zero_std": 1.0, "grad_norm": 0.1396484375, "kl": 0.03715689940145239, "learning_rate": 7.774799999999999e-06, "loss": 0.0015, "num_tokens": 25028357.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 564 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 251.0, "completions/max_terminated_length": 251.0, "completions/mean_length": 222.40625, "completions/mean_terminated_length": 222.40625, "completions/min_length": 197.0, "completions/min_terminated_length": 197.0, "epoch": 0.05993423146281956, "frac_reward_zero_std": 1.0, "grad_norm": 0.119140625, "kl": 0.04165405425010249, "learning_rate": 7.7744e-06, "loss": 0.0017, "num_tokens": 25073714.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 565 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 742.0, "completions/max_terminated_length": 742.0, "completions/mean_length": 361.90625, "completions/mean_terminated_length": 361.90625, "completions/min_length": 186.0, "completions/min_terminated_length": 186.0, "epoch": 0.060040309748594464, "frac_reward_zero_std": 0.0, "grad_norm": 1.4453125, "kl": 0.05564211937598884, "learning_rate": 7.774e-06, "loss": 0.0022, "num_tokens": 25129679.0, "reward": 3.9234397411346436, "reward_std": 0.24193021655082703, "rewards/reward_fn/mean": 3.9234397411346436, "rewards/reward_fn/std": 0.24193023145198822, "step": 566 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 576.0, "completions/max_terminated_length": 576.0, "completions/mean_length": 294.1875, "completions/mean_terminated_length": 294.1875, "completions/min_length": 217.0, "completions/min_terminated_length": 217.0, "epoch": 0.060146388034369364, "frac_reward_zero_std": 0.0, "grad_norm": 1.1171875, "kl": 0.036501555470749736, "learning_rate": 7.7736e-06, "loss": 0.0015, "num_tokens": 25179445.0, "reward": 2.908540725708008, "reward_std": 0.22343216836452484, "rewards/reward_fn/mean": 2.908540725708008, "rewards/reward_fn/std": 0.22343213856220245, "step": 567 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 468.0, "completions/max_terminated_length": 468.0, "completions/mean_length": 301.9375, "completions/mean_terminated_length": 301.9375, "completions/min_length": 237.0, "completions/min_terminated_length": 237.0, "epoch": 0.06025246632014427, "frac_reward_zero_std": 0.0, "grad_norm": 1.484375, "kl": 0.054916275781579316, "learning_rate": 7.7732e-06, "loss": 0.0022, "num_tokens": 25227891.0, "reward": 3.2726054191589355, "reward_std": 0.4647665023803711, "rewards/reward_fn/mean": 3.2726054191589355, "rewards/reward_fn/std": 0.4647665023803711, "step": 568 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 240.0, "completions/max_terminated_length": 240.0, "completions/mean_length": 174.59375, "completions/mean_terminated_length": 174.59375, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "epoch": 0.06035854460591917, "frac_reward_zero_std": 1.0, "grad_norm": 0.140625, "kl": 0.05450990307144821, "learning_rate": 7.7728e-06, "loss": 0.0022, "num_tokens": 25276134.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 569 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 318.0, "completions/max_terminated_length": 318.0, "completions/mean_length": 283.8125, "completions/mean_terminated_length": 283.8125, "completions/min_length": 272.0, "completions/min_terminated_length": 272.0, "epoch": 0.06046462289169407, "frac_reward_zero_std": 1.0, "grad_norm": 0.06787109375, "kl": 0.026582436956232414, "learning_rate": 7.7724e-06, "loss": 0.0011, "num_tokens": 25328416.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 570 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1634.0, "completions/mean_length": 701.71875, "completions/mean_terminated_length": 562.4483032226562, "completions/min_length": 342.0, "completions/min_terminated_length": 342.0, "epoch": 0.060570701177468975, "frac_reward_zero_std": 0.0, "grad_norm": 1.5625, "kl": 0.0481034837430343, "learning_rate": 7.772e-06, "loss": 0.0019, "num_tokens": 25390199.0, "reward": 2.5434768199920654, "reward_std": 0.8583298325538635, "rewards/reward_fn/mean": 2.5434768199920654, "rewards/reward_fn/std": 0.8583298325538635, "step": 571 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 747.0, "completions/max_terminated_length": 747.0, "completions/mean_length": 558.1875, "completions/mean_terminated_length": 558.1875, "completions/min_length": 386.0, "completions/min_terminated_length": 386.0, "epoch": 0.060676779463243874, "frac_reward_zero_std": 0.0, "grad_norm": 0.875, "kl": 0.036773104628082365, "learning_rate": 7.7716e-06, "loss": 0.0015, "num_tokens": 25444829.0, "reward": 2.9064252376556396, "reward_std": 0.025962000712752342, "rewards/reward_fn/mean": 2.9064252376556396, "rewards/reward_fn/std": 0.02596198581159115, "step": 572 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 85.0, "completions/max_terminated_length": 85.0, "completions/mean_length": 72.3125, "completions/mean_terminated_length": 72.3125, "completions/min_length": 66.0, "completions/min_terminated_length": 66.0, "epoch": 0.06078285774901877, "frac_reward_zero_std": 1.0, "grad_norm": 0.193359375, "kl": 0.015629198125679977, "learning_rate": 7.7712e-06, "loss": 0.0006, "num_tokens": 25481415.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 573 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 530.0, "completions/max_terminated_length": 530.0, "completions/mean_length": 404.09375, "completions/mean_terminated_length": 404.09375, "completions/min_length": 284.0, "completions/min_terminated_length": 284.0, "epoch": 0.06088893603479368, "frac_reward_zero_std": 0.0, "grad_norm": 1.25, "kl": 0.03797735116677359, "learning_rate": 7.7708e-06, "loss": 0.0015, "num_tokens": 25538282.0, "reward": 3.007188320159912, "reward_std": 0.32522156834602356, "rewards/reward_fn/mean": 3.007188320159912, "rewards/reward_fn/std": 0.32522156834602356, "step": 574 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 257.0, "completions/max_terminated_length": 257.0, "completions/mean_length": 187.96875, "completions/mean_terminated_length": 187.96875, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "epoch": 0.06099501432056858, "frac_reward_zero_std": 1.0, "grad_norm": 0.10205078125, "kl": 0.03846541151870042, "learning_rate": 7.7704e-06, "loss": 0.0015, "num_tokens": 25579145.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 575 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 169.0, "completions/max_terminated_length": 169.0, "completions/mean_length": 139.375, "completions/mean_terminated_length": 139.375, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "epoch": 0.06110109260634348, "frac_reward_zero_std": 1.0, "grad_norm": 0.12451171875, "kl": 0.034549258183687925, "learning_rate": 7.769999999999998e-06, "loss": 0.0014, "num_tokens": 25626293.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 576 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 295.0, "completions/max_terminated_length": 295.0, "completions/mean_length": 183.6875, "completions/mean_terminated_length": 183.6875, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "epoch": 0.061207170892118384, "frac_reward_zero_std": 0.0, "grad_norm": 2.015625, "kl": 0.056792226212564856, "learning_rate": 7.7696e-06, "loss": 0.0023, "num_tokens": 25677611.0, "reward": 2.673492431640625, "reward_std": 0.05745452642440796, "rewards/reward_fn/mean": 2.673492431640625, "rewards/reward_fn/std": 0.05745454132556915, "step": 577 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 305.0, "completions/max_terminated_length": 305.0, "completions/mean_length": 250.96875, "completions/mean_terminated_length": 250.96875, "completions/min_length": 191.0, "completions/min_terminated_length": 191.0, "epoch": 0.061313249177893284, "frac_reward_zero_std": 0.0, "grad_norm": 1.421875, "kl": 0.0328554887091741, "learning_rate": 7.7692e-06, "loss": 0.0013, "num_tokens": 25717642.0, "reward": 2.903965950012207, "reward_std": 0.3777007758617401, "rewards/reward_fn/mean": 2.903965950012207, "rewards/reward_fn/std": 0.3777008056640625, "step": 578 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 164.0, "completions/max_terminated_length": 164.0, "completions/mean_length": 135.28125, "completions/mean_terminated_length": 135.28125, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "epoch": 0.06141932746366819, "frac_reward_zero_std": 1.0, "grad_norm": 0.2392578125, "kl": 0.03916344471508637, "learning_rate": 7.7688e-06, "loss": 0.0016, "num_tokens": 25754707.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 579 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1216.0, "completions/max_terminated_length": 1216.0, "completions/mean_length": 530.03125, "completions/mean_terminated_length": 530.03125, "completions/min_length": 209.0, "completions/min_terminated_length": 209.0, "epoch": 0.06152540574944309, "frac_reward_zero_std": 0.0, "grad_norm": 1.609375, "kl": 0.07856652745977044, "learning_rate": 7.7684e-06, "loss": 0.0031, "num_tokens": 25819956.0, "reward": 3.016455888748169, "reward_std": 0.26328423619270325, "rewards/reward_fn/mean": 3.016455888748169, "rewards/reward_fn/std": 0.26328420639038086, "step": 580 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 361.0, "completions/max_terminated_length": 361.0, "completions/mean_length": 241.75, "completions/mean_terminated_length": 241.75, "completions/min_length": 171.0, "completions/min_terminated_length": 171.0, "epoch": 0.06163148403521799, "frac_reward_zero_std": 0.0, "grad_norm": 1.828125, "kl": 0.04260317754233256, "learning_rate": 7.767999999999999e-06, "loss": 0.0017, "num_tokens": 25859212.0, "reward": 3.762989044189453, "reward_std": 0.502149224281311, "rewards/reward_fn/mean": 3.762989044189453, "rewards/reward_fn/std": 0.5021491646766663, "step": 581 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 541.0, "completions/max_terminated_length": 541.0, "completions/mean_length": 305.0, "completions/mean_terminated_length": 305.0, "completions/min_length": 182.0, "completions/min_terminated_length": 182.0, "epoch": 0.061737562320992895, "frac_reward_zero_std": 0.0, "grad_norm": 2.625, "kl": 0.05852566647808999, "learning_rate": 7.7676e-06, "loss": 0.0023, "num_tokens": 25897740.0, "reward": 2.340458393096924, "reward_std": 0.49268436431884766, "rewards/reward_fn/mean": 2.340458393096924, "rewards/reward_fn/std": 0.49268433451652527, "step": 582 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 170.0, "completions/max_terminated_length": 170.0, "completions/mean_length": 104.40625, "completions/mean_terminated_length": 104.40625, "completions/min_length": 76.0, "completions/min_terminated_length": 76.0, "epoch": 0.061843640606767794, "frac_reward_zero_std": 1.0, "grad_norm": 0.1943359375, "kl": 0.03926274285186082, "learning_rate": 7.767199999999999e-06, "loss": 0.0016, "num_tokens": 25931033.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 583 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 458.0, "completions/max_terminated_length": 458.0, "completions/mean_length": 290.46875, "completions/mean_terminated_length": 290.46875, "completions/min_length": 174.0, "completions/min_terminated_length": 174.0, "epoch": 0.06194971889254269, "frac_reward_zero_std": 0.0, "grad_norm": 1.3203125, "kl": 0.046176372619811445, "learning_rate": 7.7668e-06, "loss": 0.0018, "num_tokens": 25981544.0, "reward": 3.8755037784576416, "reward_std": 0.39327624440193176, "rewards/reward_fn/mean": 3.8755037784576416, "rewards/reward_fn/std": 0.3932762145996094, "step": 584 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 310.0, "completions/max_terminated_length": 310.0, "completions/mean_length": 223.1875, "completions/mean_terminated_length": 223.1875, "completions/min_length": 183.0, "completions/min_terminated_length": 183.0, "epoch": 0.0620557971783176, "frac_reward_zero_std": 1.0, "grad_norm": 0.0654296875, "kl": 0.021591511933365837, "learning_rate": 7.766399999999999e-06, "loss": 0.0009, "num_tokens": 26032654.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 585 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 461.0, "completions/max_terminated_length": 461.0, "completions/mean_length": 282.6875, "completions/mean_terminated_length": 282.6875, "completions/min_length": 189.0, "completions/min_terminated_length": 189.0, "epoch": 0.0621618754640925, "frac_reward_zero_std": 0.0, "grad_norm": 1.6640625, "kl": 0.03477652568835765, "learning_rate": 7.766e-06, "loss": 0.0014, "num_tokens": 26062148.0, "reward": 3.8180346488952637, "reward_std": 0.4300572872161865, "rewards/reward_fn/mean": 3.8180346488952637, "rewards/reward_fn/std": 0.43005725741386414, "step": 586 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 517.0, "completions/max_terminated_length": 517.0, "completions/mean_length": 296.3125, "completions/mean_terminated_length": 296.3125, "completions/min_length": 213.0, "completions/min_terminated_length": 213.0, "epoch": 0.062267953749867405, "frac_reward_zero_std": 0.0, "grad_norm": 1.0625, "kl": 0.0395249537541531, "learning_rate": 7.765599999999999e-06, "loss": 0.0016, "num_tokens": 26102062.0, "reward": 2.6568655967712402, "reward_std": 0.18853385746479034, "rewards/reward_fn/mean": 2.6568655967712402, "rewards/reward_fn/std": 0.18853387236595154, "step": 587 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 283.0, "completions/max_terminated_length": 283.0, "completions/mean_length": 219.03125, "completions/mean_terminated_length": 219.03125, "completions/min_length": 178.0, "completions/min_terminated_length": 178.0, "epoch": 0.062374032035642304, "frac_reward_zero_std": 1.0, "grad_norm": 0.0986328125, "kl": 0.04510620346991345, "learning_rate": 7.7652e-06, "loss": 0.0018, "num_tokens": 26147535.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 588 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 186.0, "completions/max_terminated_length": 186.0, "completions/mean_length": 163.71875, "completions/mean_terminated_length": 163.71875, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "epoch": 0.062480110321417204, "frac_reward_zero_std": 1.0, "grad_norm": 0.08984375, "kl": 0.050036760105285794, "learning_rate": 7.7648e-06, "loss": 0.002, "num_tokens": 26192390.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 589 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 194.0, "completions/max_terminated_length": 194.0, "completions/mean_length": 140.8125, "completions/mean_terminated_length": 140.8125, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "epoch": 0.0625861886071921, "frac_reward_zero_std": 0.0, "grad_norm": 1.7890625, "kl": 0.022869454056490213, "learning_rate": 7.7644e-06, "loss": 0.0009, "num_tokens": 26232288.0, "reward": 3.9345054626464844, "reward_std": 0.2577376961708069, "rewards/reward_fn/mean": 3.9345054626464844, "rewards/reward_fn/std": 0.2577377259731293, "step": 590 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 349.0, "completions/max_terminated_length": 349.0, "completions/mean_length": 194.375, "completions/mean_terminated_length": 194.375, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "epoch": 0.06269226689296702, "frac_reward_zero_std": 0.0, "grad_norm": 1.59375, "kl": 0.026837114244699478, "learning_rate": 7.764e-06, "loss": 0.0011, "num_tokens": 26272428.0, "reward": 3.941838264465332, "reward_std": 0.22888779640197754, "rewards/reward_fn/mean": 3.941838264465332, "rewards/reward_fn/std": 0.22888781130313873, "step": 591 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 226.0, "completions/max_terminated_length": 226.0, "completions/mean_length": 174.46875, "completions/mean_terminated_length": 174.46875, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "epoch": 0.06279834517874192, "frac_reward_zero_std": 1.0, "grad_norm": 0.10107421875, "kl": 0.025287829048465937, "learning_rate": 7.7636e-06, "loss": 0.001, "num_tokens": 26315963.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 592 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 293.0, "completions/max_terminated_length": 293.0, "completions/mean_length": 203.28125, "completions/mean_terminated_length": 203.28125, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "epoch": 0.06290442346451681, "frac_reward_zero_std": 1.0, "grad_norm": 0.1318359375, "kl": 0.0472224080003798, "learning_rate": 7.7632e-06, "loss": 0.0019, "num_tokens": 26370724.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 593 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 200.0, "completions/max_terminated_length": 200.0, "completions/mean_length": 172.59375, "completions/mean_terminated_length": 172.59375, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "epoch": 0.06301050175029171, "frac_reward_zero_std": 1.0, "grad_norm": 0.1005859375, "kl": 0.03370702202664688, "learning_rate": 7.7628e-06, "loss": 0.0013, "num_tokens": 26414359.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 594 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 394.0, "completions/max_terminated_length": 394.0, "completions/mean_length": 268.21875, "completions/mean_terminated_length": 268.21875, "completions/min_length": 185.0, "completions/min_terminated_length": 185.0, "epoch": 0.06311658003606661, "frac_reward_zero_std": 0.0, "grad_norm": 2.03125, "kl": 0.03185254998970777, "learning_rate": 7.7624e-06, "loss": 0.0013, "num_tokens": 26443774.0, "reward": 3.669276714324951, "reward_std": 0.4985271990299225, "rewards/reward_fn/mean": 3.669276714324951, "rewards/reward_fn/std": 0.4985271692276001, "step": 595 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 561.0, "completions/max_terminated_length": 561.0, "completions/mean_length": 326.03125, "completions/mean_terminated_length": 326.03125, "completions/min_length": 234.0, "completions/min_terminated_length": 234.0, "epoch": 0.06322265832184151, "frac_reward_zero_std": 0.0, "grad_norm": 1.25, "kl": 0.031429233436938375, "learning_rate": 7.762e-06, "loss": 0.0013, "num_tokens": 26473759.0, "reward": 2.7664852142333984, "reward_std": 0.09329904615879059, "rewards/reward_fn/mean": 2.7664852142333984, "rewards/reward_fn/std": 0.0932990238070488, "step": 596 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 338.0, "completions/max_terminated_length": 338.0, "completions/mean_length": 267.28125, "completions/mean_terminated_length": 267.28125, "completions/min_length": 198.0, "completions/min_terminated_length": 198.0, "epoch": 0.06332873660761643, "frac_reward_zero_std": 1.0, "grad_norm": 0.0751953125, "kl": 0.03974258748348802, "learning_rate": 7.761599999999999e-06, "loss": 0.0016, "num_tokens": 26517672.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 597 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 247.0, "completions/max_terminated_length": 247.0, "completions/mean_length": 239.5625, "completions/mean_terminated_length": 239.5625, "completions/min_length": 211.0, "completions/min_terminated_length": 211.0, "epoch": 0.06343481489339133, "frac_reward_zero_std": 1.0, "grad_norm": 0.05908203125, "kl": 0.018108745745848864, "learning_rate": 7.7612e-06, "loss": 0.0007, "num_tokens": 26570458.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 598 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 344.0, "completions/max_terminated_length": 344.0, "completions/mean_length": 254.78125, "completions/mean_terminated_length": 254.78125, "completions/min_length": 204.0, "completions/min_terminated_length": 204.0, "epoch": 0.06354089317916622, "frac_reward_zero_std": 1.0, "grad_norm": 0.091796875, "kl": 0.030075128830503672, "learning_rate": 7.760799999999999e-06, "loss": 0.0012, "num_tokens": 26614131.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 599 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 948.0, "completions/max_terminated_length": 948.0, "completions/mean_length": 497.65625, "completions/mean_terminated_length": 497.65625, "completions/min_length": 370.0, "completions/min_terminated_length": 370.0, "epoch": 0.06364697146494112, "frac_reward_zero_std": 0.0, "grad_norm": 0.875, "kl": 0.041799441270995885, "learning_rate": 7.7604e-06, "loss": 0.0017, "num_tokens": 26679752.0, "reward": 2.796441078186035, "reward_std": 0.0220673605799675, "rewards/reward_fn/mean": 2.796441078186035, "rewards/reward_fn/std": 0.022067388519644737, "step": 600 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 361.0, "completions/max_terminated_length": 361.0, "completions/mean_length": 235.3125, "completions/mean_terminated_length": 235.3125, "completions/min_length": 195.0, "completions/min_terminated_length": 195.0, "epoch": 0.06375304975071602, "frac_reward_zero_std": 0.0, "grad_norm": 1.5078125, "kl": 0.02781981969019398, "learning_rate": 7.76e-06, "loss": 0.0011, "num_tokens": 26732018.0, "reward": 2.8756937980651855, "reward_std": 0.03420599177479744, "rewards/reward_fn/mean": 2.8756937980651855, "rewards/reward_fn/std": 0.03420599177479744, "step": 601 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 734.0, "completions/max_terminated_length": 734.0, "completions/mean_length": 292.4375, "completions/mean_terminated_length": 292.4375, "completions/min_length": 176.0, "completions/min_terminated_length": 176.0, "epoch": 0.06385912803649094, "frac_reward_zero_std": 1.0, "grad_norm": 0.08837890625, "kl": 0.04630755807738751, "learning_rate": 7.7596e-06, "loss": 0.0019, "num_tokens": 26781184.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 602 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1258.0, "completions/max_terminated_length": 1258.0, "completions/mean_length": 514.8125, "completions/mean_terminated_length": 514.8125, "completions/min_length": 238.0, "completions/min_terminated_length": 238.0, "epoch": 0.06396520632226584, "frac_reward_zero_std": 0.0, "grad_norm": 1.5390625, "kl": 0.0662745222216472, "learning_rate": 7.7592e-06, "loss": 0.0027, "num_tokens": 26832442.0, "reward": 3.023439884185791, "reward_std": 0.4279276430606842, "rewards/reward_fn/mean": 3.023439884185791, "rewards/reward_fn/std": 0.427927702665329, "step": 603 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 568.0, "completions/max_terminated_length": 568.0, "completions/mean_length": 279.0625, "completions/mean_terminated_length": 279.0625, "completions/min_length": 202.0, "completions/min_terminated_length": 202.0, "epoch": 0.06407128460804073, "frac_reward_zero_std": 1.0, "grad_norm": 0.09033203125, "kl": 0.03080717130796984, "learning_rate": 7.7588e-06, "loss": 0.0012, "num_tokens": 26855164.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 604 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 323.0, "completions/max_terminated_length": 323.0, "completions/mean_length": 161.875, "completions/mean_terminated_length": 161.875, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "epoch": 0.06417736289381563, "frac_reward_zero_std": 1.0, "grad_norm": 0.138671875, "kl": 0.060264868661761284, "learning_rate": 7.7584e-06, "loss": 0.0024, "num_tokens": 26895832.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 605 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 208.0, "completions/max_terminated_length": 208.0, "completions/mean_length": 173.625, "completions/mean_terminated_length": 173.625, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "epoch": 0.06428344117959053, "frac_reward_zero_std": 0.0, "grad_norm": 1.921875, "kl": 0.030933504458516836, "learning_rate": 7.758e-06, "loss": 0.0012, "num_tokens": 26934828.0, "reward": 2.9522957801818848, "reward_std": 0.016364410519599915, "rewards/reward_fn/mean": 2.9522957801818848, "rewards/reward_fn/std": 0.016364362090826035, "step": 606 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 322.0, "completions/max_terminated_length": 322.0, "completions/mean_length": 228.875, "completions/mean_terminated_length": 228.875, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "epoch": 0.06438951946536543, "frac_reward_zero_std": 1.0, "grad_norm": 0.0869140625, "kl": 0.024772782082436606, "learning_rate": 7.7576e-06, "loss": 0.001, "num_tokens": 26981256.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 607 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1384.0, "completions/max_terminated_length": 1384.0, "completions/mean_length": 412.75, "completions/mean_terminated_length": 412.75, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "epoch": 0.06449559775114035, "frac_reward_zero_std": 0.0, "grad_norm": 4.0625, "kl": 0.08590105758048594, "learning_rate": 7.7572e-06, "loss": 0.0034, "num_tokens": 27028832.0, "reward": 3.3276102542877197, "reward_std": 0.4957602918148041, "rewards/reward_fn/mean": 3.3276102542877197, "rewards/reward_fn/std": 0.49576032161712646, "step": 608 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 436.0, "completions/max_terminated_length": 436.0, "completions/mean_length": 252.8125, "completions/mean_terminated_length": 252.8125, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "epoch": 0.06460167603691525, "frac_reward_zero_std": 0.0, "grad_norm": 1.671875, "kl": 0.03629728313535452, "learning_rate": 7.7568e-06, "loss": 0.0015, "num_tokens": 27066938.0, "reward": 3.7472081184387207, "reward_std": 0.4863166809082031, "rewards/reward_fn/mean": 3.7472081184387207, "rewards/reward_fn/std": 0.48631665110588074, "step": 609 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 273.0, "completions/max_terminated_length": 273.0, "completions/mean_length": 227.8125, "completions/mean_terminated_length": 227.8125, "completions/min_length": 199.0, "completions/min_terminated_length": 199.0, "epoch": 0.06470775432269014, "frac_reward_zero_std": 0.0, "grad_norm": 1.203125, "kl": 0.024877535848645493, "learning_rate": 7.756399999999999e-06, "loss": 0.001, "num_tokens": 27121588.0, "reward": 1.737754464149475, "reward_std": 0.02460699900984764, "rewards/reward_fn/mean": 1.737754464149475, "rewards/reward_fn/std": 0.02460700459778309, "step": 610 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 850.0, "completions/max_terminated_length": 850.0, "completions/mean_length": 243.65625, "completions/mean_terminated_length": 243.65625, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "epoch": 0.06481383260846504, "frac_reward_zero_std": 1.0, "grad_norm": 0.115234375, "kl": 0.0540953372255899, "learning_rate": 7.756e-06, "loss": 0.0022, "num_tokens": 27163785.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 611 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1047.0, "completions/max_terminated_length": 1047.0, "completions/mean_length": 342.71875, "completions/mean_terminated_length": 342.71875, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "epoch": 0.06491991089423994, "frac_reward_zero_std": 0.0, "grad_norm": 1.796875, "kl": 0.05290446960134432, "learning_rate": 7.7556e-06, "loss": 0.0021, "num_tokens": 27207776.0, "reward": 3.002070903778076, "reward_std": 0.05182076618075371, "rewards/reward_fn/mean": 3.002070903778076, "rewards/reward_fn/std": 0.051820818334817886, "step": 612 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1387.0, "completions/max_terminated_length": 1387.0, "completions/mean_length": 407.4375, "completions/mean_terminated_length": 407.4375, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "epoch": 0.06502598918001486, "frac_reward_zero_std": 1.0, "grad_norm": 0.09130859375, "kl": 0.07026238366961479, "learning_rate": 7.7552e-06, "loss": 0.0028, "num_tokens": 27259982.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 613 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 578.0, "completions/max_terminated_length": 578.0, "completions/mean_length": 292.0625, "completions/mean_terminated_length": 292.0625, "completions/min_length": 192.0, "completions/min_terminated_length": 192.0, "epoch": 0.06513206746578976, "frac_reward_zero_std": 1.0, "grad_norm": 0.09814453125, "kl": 0.05310130078578368, "learning_rate": 7.7548e-06, "loss": 0.0021, "num_tokens": 27315440.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 614 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 602.0, "completions/max_terminated_length": 602.0, "completions/mean_length": 221.6875, "completions/mean_terminated_length": 221.6875, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "epoch": 0.06523814575156466, "frac_reward_zero_std": 1.0, "grad_norm": 0.10693359375, "kl": 0.05479395019938238, "learning_rate": 7.7544e-06, "loss": 0.0022, "num_tokens": 27351750.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 615 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 594.0, "completions/max_terminated_length": 594.0, "completions/mean_length": 244.5625, "completions/mean_terminated_length": 244.5625, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "epoch": 0.06534422403733955, "frac_reward_zero_std": 1.0, "grad_norm": 0.08056640625, "kl": 0.0540774236433208, "learning_rate": 7.753999999999999e-06, "loss": 0.0022, "num_tokens": 27398040.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 616 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 550.0, "completions/max_terminated_length": 550.0, "completions/mean_length": 231.6875, "completions/mean_terminated_length": 231.6875, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "epoch": 0.06545030232311445, "frac_reward_zero_std": 1.0, "grad_norm": 0.0869140625, "kl": 0.041808648442383856, "learning_rate": 7.7536e-06, "loss": 0.0017, "num_tokens": 27457646.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 617 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.40625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1973.0, "completions/mean_length": 1549.90625, "completions/mean_terminated_length": 1209.105224609375, "completions/min_length": 391.0, "completions/min_terminated_length": 391.0, "epoch": 0.06555638060888937, "frac_reward_zero_std": 0.0, "grad_norm": 1.40625, "kl": 0.04176906222710386, "learning_rate": 7.753199999999999e-06, "loss": 0.0017, "num_tokens": 27542891.0, "reward": 1.575758934020996, "reward_std": 1.2176213264465332, "rewards/reward_fn/mean": 1.575758934020996, "rewards/reward_fn/std": 1.2176213264465332, "step": 618 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1917.0, "completions/max_terminated_length": 1917.0, "completions/mean_length": 389.1875, "completions/mean_terminated_length": 389.1875, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "epoch": 0.06566245889466427, "frac_reward_zero_std": 0.0, "grad_norm": 1.6953125, "kl": 0.059505374170839787, "learning_rate": 7.7528e-06, "loss": 0.0024, "num_tokens": 27574737.0, "reward": 3.0435049533843994, "reward_std": 0.057892631739377975, "rewards/reward_fn/mean": 3.0435049533843994, "rewards/reward_fn/std": 0.05789259821176529, "step": 619 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1275.0, "completions/max_terminated_length": 1275.0, "completions/mean_length": 348.15625, "completions/mean_terminated_length": 348.15625, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "epoch": 0.06576853718043917, "frac_reward_zero_std": 0.0, "grad_norm": 1.6171875, "kl": 0.07023045897949487, "learning_rate": 7.752399999999999e-06, "loss": 0.0028, "num_tokens": 27615606.0, "reward": 3.835360050201416, "reward_std": 0.3887580633163452, "rewards/reward_fn/mean": 3.835360050201416, "rewards/reward_fn/std": 0.3887580633163452, "step": 620 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 997.0, "completions/max_terminated_length": 997.0, "completions/mean_length": 335.5, "completions/mean_terminated_length": 335.5, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "epoch": 0.06587461546621406, "frac_reward_zero_std": 0.0, "grad_norm": 2.59375, "kl": 0.05529448727611452, "learning_rate": 7.752e-06, "loss": 0.0022, "num_tokens": 27672454.0, "reward": 2.523176670074463, "reward_std": 1.0861042737960815, "rewards/reward_fn/mean": 2.523176670074463, "rewards/reward_fn/std": 1.0861042737960815, "step": 621 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1912.0, "completions/max_terminated_length": 1912.0, "completions/mean_length": 435.96875, "completions/mean_terminated_length": 435.96875, "completions/min_length": 245.0, "completions/min_terminated_length": 245.0, "epoch": 0.06598069375198896, "frac_reward_zero_std": 0.0, "grad_norm": 1.4921875, "kl": 0.05438588681863621, "learning_rate": 7.751599999999999e-06, "loss": 0.0022, "num_tokens": 27726629.0, "reward": 3.077277421951294, "reward_std": 0.45385316014289856, "rewards/reward_fn/mean": 3.077277421951294, "rewards/reward_fn/std": 0.45385313034057617, "step": 622 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1613.0, "completions/mean_length": 572.25, "completions/mean_terminated_length": 524.6451416015625, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "epoch": 0.06608677203776386, "frac_reward_zero_std": 0.0, "grad_norm": 0.9765625, "kl": 0.06977469654520974, "learning_rate": 7.7512e-06, "loss": 0.0028, "num_tokens": 27781901.0, "reward": 3.875, "reward_std": 0.7071067690849304, "rewards/reward_fn/mean": 3.875, "rewards/reward_fn/std": 0.7071067690849304, "step": 623 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 651.0, "completions/max_terminated_length": 651.0, "completions/mean_length": 238.21875, "completions/mean_terminated_length": 238.21875, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "epoch": 0.06619285032353878, "frac_reward_zero_std": 0.0, "grad_norm": 3.03125, "kl": 0.05847651744261384, "learning_rate": 7.7508e-06, "loss": 0.0023, "num_tokens": 27835924.0, "reward": 3.654426336288452, "reward_std": 0.3738429844379425, "rewards/reward_fn/mean": 3.654426336288452, "rewards/reward_fn/std": 0.3738429844379425, "step": 624 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1649.0, "completions/mean_length": 512.5625, "completions/mean_terminated_length": 463.0322570800781, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "epoch": 0.06629892860931368, "frac_reward_zero_std": 0.0, "grad_norm": 1.140625, "kl": 0.06687481026165187, "learning_rate": 7.7504e-06, "loss": 0.0027, "num_tokens": 27882566.0, "reward": 2.687096357345581, "reward_std": 0.513810932636261, "rewards/reward_fn/mean": 2.687096357345581, "rewards/reward_fn/std": 0.5138109922409058, "step": 625 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1346.0, "completions/max_terminated_length": 1346.0, "completions/mean_length": 325.40625, "completions/mean_terminated_length": 325.40625, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "epoch": 0.06640500689508858, "frac_reward_zero_std": 0.0, "grad_norm": 1.0625, "kl": 0.04519660054938868, "learning_rate": 7.75e-06, "loss": 0.0018, "num_tokens": 27915059.0, "reward": 3.95910382270813, "reward_std": 0.23134386539459229, "rewards/reward_fn/mean": 3.95910382270813, "rewards/reward_fn/std": 0.23134388029575348, "step": 626 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1281.0, "completions/max_terminated_length": 1281.0, "completions/mean_length": 377.1875, "completions/mean_terminated_length": 377.1875, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "epoch": 0.06651108518086347, "frac_reward_zero_std": 1.0, "grad_norm": 0.07666015625, "kl": 0.051734329434111714, "learning_rate": 7.7496e-06, "loss": 0.0021, "num_tokens": 27972953.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 627 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 644.0, "completions/max_terminated_length": 644.0, "completions/mean_length": 169.21875, "completions/mean_terminated_length": 169.21875, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "epoch": 0.06661716346663837, "frac_reward_zero_std": 0.0, "grad_norm": 3.359375, "kl": 0.046305317111546174, "learning_rate": 7.7492e-06, "loss": 0.0019, "num_tokens": 28016192.0, "reward": 3.564767837524414, "reward_std": 0.5709716081619263, "rewards/reward_fn/mean": 3.564767837524414, "rewards/reward_fn/std": 0.5709716081619263, "step": 628 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 954.0, "completions/max_terminated_length": 954.0, "completions/mean_length": 299.875, "completions/mean_terminated_length": 299.875, "completions/min_length": 187.0, "completions/min_terminated_length": 187.0, "epoch": 0.06672324175241329, "frac_reward_zero_std": 0.0, "grad_norm": 1.9296875, "kl": 0.05322365625761449, "learning_rate": 7.7488e-06, "loss": 0.0021, "num_tokens": 28064860.0, "reward": 2.887842893600464, "reward_std": 0.03686607629060745, "rewards/reward_fn/mean": 2.887842893600464, "rewards/reward_fn/std": 0.036866072565317154, "step": 629 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 353.0, "completions/max_terminated_length": 353.0, "completions/mean_length": 111.1875, "completions/mean_terminated_length": 111.1875, "completions/min_length": 54.0, "completions/min_terminated_length": 54.0, "epoch": 0.06682932003818819, "frac_reward_zero_std": 1.0, "grad_norm": 0.25, "kl": 0.040750893735093996, "learning_rate": 7.7484e-06, "loss": 0.0016, "num_tokens": 28108034.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 630 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 199.0, "completions/max_terminated_length": 199.0, "completions/mean_length": 157.78125, "completions/mean_terminated_length": 157.78125, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "epoch": 0.06693539832396309, "frac_reward_zero_std": 1.0, "grad_norm": 0.1083984375, "kl": 0.029879302775952965, "learning_rate": 7.748e-06, "loss": 0.0012, "num_tokens": 28145371.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 631 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 517.0, "completions/max_terminated_length": 517.0, "completions/mean_length": 264.9375, "completions/mean_terminated_length": 264.9375, "completions/min_length": 206.0, "completions/min_terminated_length": 206.0, "epoch": 0.06704147660973798, "frac_reward_zero_std": 0.0, "grad_norm": 1.65625, "kl": 0.03904081700602546, "learning_rate": 7.7476e-06, "loss": 0.0016, "num_tokens": 28196857.0, "reward": 2.8503220081329346, "reward_std": 0.020154688507318497, "rewards/reward_fn/mean": 2.8503220081329346, "rewards/reward_fn/std": 0.02015470154583454, "step": 632 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 341.0, "completions/max_terminated_length": 341.0, "completions/mean_length": 227.28125, "completions/mean_terminated_length": 227.28125, "completions/min_length": 184.0, "completions/min_terminated_length": 184.0, "epoch": 0.06714755489551288, "frac_reward_zero_std": 1.0, "grad_norm": 0.10693359375, "kl": 0.05340335750952363, "learning_rate": 7.7472e-06, "loss": 0.0021, "num_tokens": 28235810.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 633 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 400.0, "completions/max_terminated_length": 400.0, "completions/mean_length": 151.9375, "completions/mean_terminated_length": 151.9375, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "epoch": 0.06725363318128778, "frac_reward_zero_std": 0.0, "grad_norm": 3.40625, "kl": 0.046266791701782495, "learning_rate": 7.7468e-06, "loss": 0.0019, "num_tokens": 28288096.0, "reward": 2.819547653198242, "reward_std": 0.035231560468673706, "rewards/reward_fn/mean": 2.819547653198242, "rewards/reward_fn/std": 0.03523159399628639, "step": 634 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 176.0, "completions/max_terminated_length": 176.0, "completions/mean_length": 149.90625, "completions/mean_terminated_length": 149.90625, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "epoch": 0.0673597114670627, "frac_reward_zero_std": 1.0, "grad_norm": 0.14453125, "kl": 0.02690366155002266, "learning_rate": 7.7464e-06, "loss": 0.0011, "num_tokens": 28347933.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 635 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 340.0, "completions/max_terminated_length": 340.0, "completions/mean_length": 226.59375, "completions/mean_terminated_length": 226.59375, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "epoch": 0.0674657897528376, "frac_reward_zero_std": 1.0, "grad_norm": 0.1220703125, "kl": 0.047625879873521626, "learning_rate": 7.746e-06, "loss": 0.0019, "num_tokens": 28397328.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 636 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 333.0, "completions/max_terminated_length": 333.0, "completions/mean_length": 232.625, "completions/mean_terminated_length": 232.625, "completions/min_length": 178.0, "completions/min_terminated_length": 178.0, "epoch": 0.0675718680386125, "frac_reward_zero_std": 1.0, "grad_norm": 0.11328125, "kl": 0.054383948212489486, "learning_rate": 7.7456e-06, "loss": 0.0022, "num_tokens": 28443652.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 637 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 215.0, "completions/max_terminated_length": 215.0, "completions/mean_length": 160.125, "completions/mean_terminated_length": 160.125, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "epoch": 0.0676779463243874, "frac_reward_zero_std": 1.0, "grad_norm": 0.140625, "kl": 0.044864116644021124, "learning_rate": 7.7452e-06, "loss": 0.0018, "num_tokens": 28477256.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 638 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 192.0, "completions/max_terminated_length": 192.0, "completions/mean_length": 143.3125, "completions/mean_terminated_length": 143.3125, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "epoch": 0.0677840246101623, "frac_reward_zero_std": 1.0, "grad_norm": 0.10791015625, "kl": 0.03261691506486386, "learning_rate": 7.744799999999999e-06, "loss": 0.0013, "num_tokens": 28518738.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 639 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 182.0, "completions/max_terminated_length": 182.0, "completions/mean_length": 139.09375, "completions/mean_terminated_length": 139.09375, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "epoch": 0.06789010289593721, "frac_reward_zero_std": 1.0, "grad_norm": 0.0888671875, "kl": 0.04600106261204928, "learning_rate": 7.7444e-06, "loss": 0.0018, "num_tokens": 28566805.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 640 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 236.0, "completions/max_terminated_length": 236.0, "completions/mean_length": 153.59375, "completions/mean_terminated_length": 153.59375, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "epoch": 0.0679961811817121, "frac_reward_zero_std": 1.0, "grad_norm": 0.1337890625, "kl": 0.05108794302213937, "learning_rate": 7.743999999999999e-06, "loss": 0.002, "num_tokens": 28627016.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 641 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 468.0, "completions/max_terminated_length": 468.0, "completions/mean_length": 365.1875, "completions/mean_terminated_length": 365.1875, "completions/min_length": 304.0, "completions/min_terminated_length": 304.0, "epoch": 0.068102259467487, "frac_reward_zero_std": 0.0, "grad_norm": 1.1015625, "kl": 0.02875410852720961, "learning_rate": 7.7436e-06, "loss": 0.0011, "num_tokens": 28676878.0, "reward": 2.87377667427063, "reward_std": 0.03773174062371254, "rewards/reward_fn/mean": 2.87377667427063, "rewards/reward_fn/std": 0.03773171827197075, "step": 642 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1325.0, "completions/max_terminated_length": 1325.0, "completions/mean_length": 715.3125, "completions/mean_terminated_length": 715.3125, "completions/min_length": 380.0, "completions/min_terminated_length": 380.0, "epoch": 0.0682083377532619, "frac_reward_zero_std": 0.0, "grad_norm": 1.1328125, "kl": 0.06581689266022295, "learning_rate": 7.743199999999999e-06, "loss": 0.0026, "num_tokens": 28736984.0, "reward": 2.59894061088562, "reward_std": 0.3901492953300476, "rewards/reward_fn/mean": 2.59894061088562, "rewards/reward_fn/std": 0.3901492655277252, "step": 643 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 132.0, "completions/max_terminated_length": 132.0, "completions/mean_length": 105.78125, "completions/mean_terminated_length": 105.78125, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "epoch": 0.0683144160390368, "frac_reward_zero_std": 1.0, "grad_norm": 0.140625, "kl": 0.03504444262944162, "learning_rate": 7.7428e-06, "loss": 0.0014, "num_tokens": 28773137.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 644 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 257.0, "completions/max_terminated_length": 257.0, "completions/mean_length": 179.84375, "completions/mean_terminated_length": 179.84375, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "epoch": 0.06842049432481172, "frac_reward_zero_std": 0.0, "grad_norm": 1.71875, "kl": 0.04245975334197283, "learning_rate": 7.742399999999999e-06, "loss": 0.0017, "num_tokens": 28818060.0, "reward": 3.962952136993408, "reward_std": 0.20957522094249725, "rewards/reward_fn/mean": 3.962952136993408, "rewards/reward_fn/std": 0.20957525074481964, "step": 645 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 624.0, "completions/max_terminated_length": 624.0, "completions/mean_length": 336.9375, "completions/mean_terminated_length": 336.9375, "completions/min_length": 220.0, "completions/min_terminated_length": 220.0, "epoch": 0.06852657261058662, "frac_reward_zero_std": 0.0, "grad_norm": 1.984375, "kl": 0.0532979853451252, "learning_rate": 7.742e-06, "loss": 0.0021, "num_tokens": 28857322.0, "reward": 3.685168743133545, "reward_std": 0.5120775699615479, "rewards/reward_fn/mean": 3.685168743133545, "rewards/reward_fn/std": 0.5120775699615479, "step": 646 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 152.0, "completions/max_terminated_length": 152.0, "completions/mean_length": 142.03125, "completions/mean_terminated_length": 142.03125, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "epoch": 0.06863265089636152, "frac_reward_zero_std": 1.0, "grad_norm": 0.0869140625, "kl": 0.023056287580402568, "learning_rate": 7.741599999999999e-06, "loss": 0.0009, "num_tokens": 28894699.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 647 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 330.0, "completions/max_terminated_length": 330.0, "completions/mean_length": 226.34375, "completions/mean_terminated_length": 226.34375, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "epoch": 0.06873872918213642, "frac_reward_zero_std": 1.0, "grad_norm": 0.0849609375, "kl": 0.04996389290317893, "learning_rate": 7.7412e-06, "loss": 0.002, "num_tokens": 28938166.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 648 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 328.0, "completions/max_terminated_length": 328.0, "completions/mean_length": 266.90625, "completions/mean_terminated_length": 266.90625, "completions/min_length": 217.0, "completions/min_terminated_length": 217.0, "epoch": 0.06884480746791131, "frac_reward_zero_std": 1.0, "grad_norm": 0.07421875, "kl": 0.03471590060507879, "learning_rate": 7.7408e-06, "loss": 0.0014, "num_tokens": 28994931.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 649 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 155.0, "completions/max_terminated_length": 155.0, "completions/mean_length": 131.625, "completions/mean_terminated_length": 131.625, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "epoch": 0.06895088575368621, "frac_reward_zero_std": 1.0, "grad_norm": 0.109375, "kl": 0.031769126711878926, "learning_rate": 7.7404e-06, "loss": 0.0013, "num_tokens": 29029703.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 650 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 108.0, "completions/max_terminated_length": 108.0, "completions/mean_length": 86.0, "completions/mean_terminated_length": 86.0, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "epoch": 0.06905696403946113, "frac_reward_zero_std": 0.0, "grad_norm": 2.96875, "kl": 0.041167730116285384, "learning_rate": 7.74e-06, "loss": 0.0017, "num_tokens": 29050727.0, "reward": 3.076449155807495, "reward_std": 0.009099267423152924, "rewards/reward_fn/mean": 3.076449155807495, "rewards/reward_fn/std": 0.009099281392991543, "step": 651 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 443.0, "completions/max_terminated_length": 443.0, "completions/mean_length": 264.125, "completions/mean_terminated_length": 264.125, "completions/min_length": 202.0, "completions/min_terminated_length": 202.0, "epoch": 0.06916304232523603, "frac_reward_zero_std": 0.0, "grad_norm": 1.7421875, "kl": 0.045598643948324025, "learning_rate": 7.7396e-06, "loss": 0.0018, "num_tokens": 29114539.0, "reward": 2.8244357109069824, "reward_std": 0.03792543336749077, "rewards/reward_fn/mean": 2.8244357109069824, "rewards/reward_fn/std": 0.03792539983987808, "step": 652 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 316.0, "completions/max_terminated_length": 316.0, "completions/mean_length": 229.1875, "completions/mean_terminated_length": 229.1875, "completions/min_length": 184.0, "completions/min_terminated_length": 184.0, "epoch": 0.06926912061101093, "frac_reward_zero_std": 0.0, "grad_norm": 2.125, "kl": 0.043347687751520425, "learning_rate": 7.7392e-06, "loss": 0.0017, "num_tokens": 29152081.0, "reward": 3.718015670776367, "reward_std": 0.42512795329093933, "rewards/reward_fn/mean": 3.718015670776367, "rewards/reward_fn/std": 0.4251279830932617, "step": 653 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 380.0, "completions/max_terminated_length": 380.0, "completions/mean_length": 240.625, "completions/mean_terminated_length": 240.625, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "epoch": 0.06937519889678583, "frac_reward_zero_std": 0.0, "grad_norm": 1.90625, "kl": 0.0510714779375121, "learning_rate": 7.7388e-06, "loss": 0.002, "num_tokens": 29194725.0, "reward": 3.521083354949951, "reward_std": 0.7184927463531494, "rewards/reward_fn/mean": 3.521083354949951, "rewards/reward_fn/std": 0.7184926867485046, "step": 654 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 531.0, "completions/max_terminated_length": 531.0, "completions/mean_length": 318.0, "completions/mean_terminated_length": 318.0, "completions/min_length": 217.0, "completions/min_terminated_length": 217.0, "epoch": 0.06948127718256072, "frac_reward_zero_std": 0.0, "grad_norm": 1.3046875, "kl": 0.05452185485046357, "learning_rate": 7.7384e-06, "loss": 0.0022, "num_tokens": 29241221.0, "reward": 3.9632177352905273, "reward_std": 0.2080727219581604, "rewards/reward_fn/mean": 3.9632177352905273, "rewards/reward_fn/std": 0.2080727070569992, "step": 655 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 171.0, "completions/max_terminated_length": 171.0, "completions/mean_length": 131.0625, "completions/mean_terminated_length": 131.0625, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "epoch": 0.06958735546833564, "frac_reward_zero_std": 0.0, "grad_norm": 3.265625, "kl": 0.0322262124682311, "learning_rate": 7.738e-06, "loss": 0.0013, "num_tokens": 29278951.0, "reward": 2.871333599090576, "reward_std": 0.014810092747211456, "rewards/reward_fn/mean": 2.871333599090576, "rewards/reward_fn/std": 0.01481009740382433, "step": 656 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 270.0, "completions/max_terminated_length": 270.0, "completions/mean_length": 212.3125, "completions/mean_terminated_length": 212.3125, "completions/min_length": 174.0, "completions/min_terminated_length": 174.0, "epoch": 0.06969343375411054, "frac_reward_zero_std": 0.0, "grad_norm": 1.515625, "kl": 0.04696744668763131, "learning_rate": 7.737599999999999e-06, "loss": 0.0019, "num_tokens": 29318705.0, "reward": 2.895874500274658, "reward_std": 0.022386854514479637, "rewards/reward_fn/mean": 2.895874500274658, "rewards/reward_fn/std": 0.022386867552995682, "step": 657 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 327.0, "completions/max_terminated_length": 327.0, "completions/mean_length": 265.46875, "completions/mean_terminated_length": 265.46875, "completions/min_length": 234.0, "completions/min_terminated_length": 234.0, "epoch": 0.06979951203988544, "frac_reward_zero_std": 0.0, "grad_norm": 1.2578125, "kl": 0.02159346017288044, "learning_rate": 7.7372e-06, "loss": 0.0009, "num_tokens": 29347392.0, "reward": 2.750436305999756, "reward_std": 0.03207426145672798, "rewards/reward_fn/mean": 2.750436305999756, "rewards/reward_fn/std": 0.032074298709630966, "step": 658 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 230.0, "completions/max_terminated_length": 230.0, "completions/mean_length": 184.375, "completions/mean_terminated_length": 184.375, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "epoch": 0.06990559032566034, "frac_reward_zero_std": 1.0, "grad_norm": 0.1552734375, "kl": 0.04227348789572716, "learning_rate": 7.736799999999998e-06, "loss": 0.0017, "num_tokens": 29370412.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 659 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 386.0, "completions/max_terminated_length": 386.0, "completions/mean_length": 291.28125, "completions/mean_terminated_length": 291.28125, "completions/min_length": 186.0, "completions/min_terminated_length": 186.0, "epoch": 0.07001166861143523, "frac_reward_zero_std": 0.0, "grad_norm": 1.1953125, "kl": 0.032787035626824945, "learning_rate": 7.7364e-06, "loss": 0.0013, "num_tokens": 29416629.0, "reward": 3.9628915786743164, "reward_std": 0.20991654694080353, "rewards/reward_fn/mean": 3.9628915786743164, "rewards/reward_fn/std": 0.20991650223731995, "step": 660 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 179.0, "completions/max_terminated_length": 179.0, "completions/mean_length": 150.71875, "completions/mean_terminated_length": 150.71875, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "epoch": 0.07011774689721013, "frac_reward_zero_std": 0.0, "grad_norm": 1.5546875, "kl": 0.05282619083300233, "learning_rate": 7.736e-06, "loss": 0.0021, "num_tokens": 29451724.0, "reward": 2.8188912868499756, "reward_std": 0.01984592340886593, "rewards/reward_fn/mean": 2.8188912868499756, "rewards/reward_fn/std": 0.01984594017267227, "step": 661 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 108.0, "completions/max_terminated_length": 108.0, "completions/mean_length": 85.78125, "completions/mean_terminated_length": 85.78125, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "epoch": 0.07022382518298505, "frac_reward_zero_std": 1.0, "grad_norm": 0.19140625, "kl": 0.023483653378207237, "learning_rate": 7.7356e-06, "loss": 0.0009, "num_tokens": 29482821.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 662 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 142.0, "completions/max_terminated_length": 142.0, "completions/mean_length": 96.3125, "completions/mean_terminated_length": 96.3125, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "epoch": 0.07032990346875995, "frac_reward_zero_std": 0.0, "grad_norm": 3.71875, "kl": 0.04963029740611091, "learning_rate": 7.7352e-06, "loss": 0.002, "num_tokens": 29530031.0, "reward": 3.210749626159668, "reward_std": 0.009940498508512974, "rewards/reward_fn/mean": 3.210749626159668, "rewards/reward_fn/std": 0.009940499439835548, "step": 663 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 298.0, "completions/max_terminated_length": 298.0, "completions/mean_length": 191.4375, "completions/mean_terminated_length": 191.4375, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "epoch": 0.07043598175453485, "frac_reward_zero_std": 0.0, "grad_norm": 2.125, "kl": 0.05990648630540818, "learning_rate": 7.7348e-06, "loss": 0.0024, "num_tokens": 29554973.0, "reward": 3.537442207336426, "reward_std": 0.6495077013969421, "rewards/reward_fn/mean": 3.537442207336426, "rewards/reward_fn/std": 0.6495076417922974, "step": 664 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 154.0, "completions/max_terminated_length": 154.0, "completions/mean_length": 142.03125, "completions/mean_terminated_length": 142.03125, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "epoch": 0.07054206004030975, "frac_reward_zero_std": 1.0, "grad_norm": 0.150390625, "kl": 0.0415572501369752, "learning_rate": 7.7344e-06, "loss": 0.0017, "num_tokens": 29608254.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 665 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 231.0, "completions/max_terminated_length": 231.0, "completions/mean_length": 207.09375, "completions/mean_terminated_length": 207.09375, "completions/min_length": 194.0, "completions/min_terminated_length": 194.0, "epoch": 0.07064813832608464, "frac_reward_zero_std": 1.0, "grad_norm": 0.1796875, "kl": 0.03682469832710922, "learning_rate": 7.733999999999999e-06, "loss": 0.0015, "num_tokens": 29655361.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 666 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 291.0, "completions/max_terminated_length": 291.0, "completions/mean_length": 211.0, "completions/mean_terminated_length": 211.0, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "epoch": 0.07075421661185956, "frac_reward_zero_std": 0.0, "grad_norm": 1.96875, "kl": 0.0421582740964368, "learning_rate": 7.7336e-06, "loss": 0.0017, "num_tokens": 29678721.0, "reward": 3.49739933013916, "reward_std": 0.45667457580566406, "rewards/reward_fn/mean": 3.49739933013916, "rewards/reward_fn/std": 0.45667460560798645, "step": 667 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 294.0, "completions/max_terminated_length": 294.0, "completions/mean_length": 133.1875, "completions/mean_terminated_length": 133.1875, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "epoch": 0.07086029489763446, "frac_reward_zero_std": 0.0, "grad_norm": 1.6484375, "kl": 0.0935792492236942, "learning_rate": 7.733199999999999e-06, "loss": 0.0037, "num_tokens": 29700839.0, "reward": 2.8479533195495605, "reward_std": 0.30300331115722656, "rewards/reward_fn/mean": 2.8479533195495605, "rewards/reward_fn/std": 0.3030032813549042, "step": 668 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 511.0, "completions/max_terminated_length": 511.0, "completions/mean_length": 351.03125, "completions/mean_terminated_length": 351.03125, "completions/min_length": 285.0, "completions/min_terminated_length": 285.0, "epoch": 0.07096637318340936, "frac_reward_zero_std": 0.0, "grad_norm": 1.3359375, "kl": 0.06572336354292929, "learning_rate": 7.7328e-06, "loss": 0.0026, "num_tokens": 29755400.0, "reward": 3.5044994354248047, "reward_std": 0.5040919184684753, "rewards/reward_fn/mean": 3.5044994354248047, "rewards/reward_fn/std": 0.5040919184684753, "step": 669 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 252.0, "completions/max_terminated_length": 252.0, "completions/mean_length": 167.46875, "completions/mean_terminated_length": 167.46875, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "epoch": 0.07107245146918426, "frac_reward_zero_std": 0.0, "grad_norm": 2.109375, "kl": 0.05414850829401985, "learning_rate": 7.732399999999999e-06, "loss": 0.0022, "num_tokens": 29807191.0, "reward": 3.538360118865967, "reward_std": 0.5391759276390076, "rewards/reward_fn/mean": 3.538360118865967, "rewards/reward_fn/std": 0.5391759276390076, "step": 670 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 133.21875, "completions/mean_terminated_length": 133.21875, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "epoch": 0.07117852975495916, "frac_reward_zero_std": 0.0, "grad_norm": 2.9375, "kl": 0.1002915867138654, "learning_rate": 7.732e-06, "loss": 0.004, "num_tokens": 29841758.0, "reward": 2.857252359390259, "reward_std": 0.05256405472755432, "rewards/reward_fn/mean": 2.857252359390259, "rewards/reward_fn/std": 0.05256406217813492, "step": 671 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1114.0, "completions/max_terminated_length": 1114.0, "completions/mean_length": 627.9375, "completions/mean_terminated_length": 627.9375, "completions/min_length": 257.0, "completions/min_terminated_length": 257.0, "epoch": 0.07128460804073407, "frac_reward_zero_std": 0.0, "grad_norm": 1.0703125, "kl": 0.0414414910483174, "learning_rate": 7.7316e-06, "loss": 0.0017, "num_tokens": 29897244.0, "reward": 2.651923656463623, "reward_std": 0.30288705229759216, "rewards/reward_fn/mean": 2.651923656463623, "rewards/reward_fn/std": 0.3028870224952698, "step": 672 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 292.0, "completions/max_terminated_length": 292.0, "completions/mean_length": 243.25, "completions/mean_terminated_length": 243.25, "completions/min_length": 215.0, "completions/min_terminated_length": 215.0, "epoch": 0.07139068632650897, "frac_reward_zero_std": 1.0, "grad_norm": 0.11376953125, "kl": 0.06160746526438743, "learning_rate": 7.7312e-06, "loss": 0.0025, "num_tokens": 29937188.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 673 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 200.0, "completions/max_terminated_length": 200.0, "completions/mean_length": 190.0625, "completions/mean_terminated_length": 190.0625, "completions/min_length": 182.0, "completions/min_terminated_length": 182.0, "epoch": 0.07149676461228387, "frac_reward_zero_std": 1.0, "grad_norm": 0.130859375, "kl": 0.043393855332396924, "learning_rate": 7.7308e-06, "loss": 0.0017, "num_tokens": 29983494.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 674 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 278.0, "completions/max_terminated_length": 278.0, "completions/mean_length": 206.09375, "completions/mean_terminated_length": 206.09375, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "epoch": 0.07160284289805877, "frac_reward_zero_std": 0.0, "grad_norm": 1.9921875, "kl": 0.05433402652852237, "learning_rate": 7.7304e-06, "loss": 0.0022, "num_tokens": 30029865.0, "reward": 3.931962013244629, "reward_std": 0.3848804235458374, "rewards/reward_fn/mean": 3.931962013244629, "rewards/reward_fn/std": 0.3848804533481598, "step": 675 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 243.0, "completions/max_terminated_length": 243.0, "completions/mean_length": 220.5625, "completions/mean_terminated_length": 220.5625, "completions/min_length": 190.0, "completions/min_terminated_length": 190.0, "epoch": 0.07170892118383367, "frac_reward_zero_std": 0.0, "grad_norm": 1.6328125, "kl": 0.028526412788778543, "learning_rate": 7.73e-06, "loss": 0.0011, "num_tokens": 30070107.0, "reward": 2.9268527030944824, "reward_std": 0.03191540390253067, "rewards/reward_fn/mean": 2.9268527030944824, "rewards/reward_fn/std": 0.03191535919904709, "step": 676 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 649.0, "completions/max_terminated_length": 649.0, "completions/mean_length": 342.78125, "completions/mean_terminated_length": 342.78125, "completions/min_length": 188.0, "completions/min_terminated_length": 188.0, "epoch": 0.07181499946960856, "frac_reward_zero_std": 0.0, "grad_norm": 1.484375, "kl": 0.05647675262298435, "learning_rate": 7.7296e-06, "loss": 0.0023, "num_tokens": 30117972.0, "reward": 2.998821258544922, "reward_std": 0.23165473341941833, "rewards/reward_fn/mean": 2.998821258544922, "rewards/reward_fn/std": 0.23165474832057953, "step": 677 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 309.0, "completions/max_terminated_length": 309.0, "completions/mean_length": 211.78125, "completions/mean_terminated_length": 211.78125, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "epoch": 0.07192107775538348, "frac_reward_zero_std": 1.0, "grad_norm": 0.125, "kl": 0.04946940788067877, "learning_rate": 7.729199999999999e-06, "loss": 0.002, "num_tokens": 30154189.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 678 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 175.0, "completions/max_terminated_length": 175.0, "completions/mean_length": 167.625, "completions/mean_terminated_length": 167.625, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "epoch": 0.07202715604115838, "frac_reward_zero_std": 1.0, "grad_norm": 0.1083984375, "kl": 0.03135518968338147, "learning_rate": 7.7288e-06, "loss": 0.0013, "num_tokens": 30210497.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 679 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 275.0, "completions/max_terminated_length": 275.0, "completions/mean_length": 220.0625, "completions/mean_terminated_length": 220.0625, "completions/min_length": 199.0, "completions/min_terminated_length": 199.0, "epoch": 0.07213323432693328, "frac_reward_zero_std": 1.0, "grad_norm": 0.09765625, "kl": 0.027862557559274137, "learning_rate": 7.728399999999999e-06, "loss": 0.0011, "num_tokens": 30259875.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 680 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 275.0, "completions/max_terminated_length": 275.0, "completions/mean_length": 206.0, "completions/mean_terminated_length": 206.0, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "epoch": 0.07223931261270818, "frac_reward_zero_std": 0.0, "grad_norm": 1.921875, "kl": 0.051036402059253305, "learning_rate": 7.728e-06, "loss": 0.002, "num_tokens": 30320515.0, "reward": 2.9141030311584473, "reward_std": 0.038106195628643036, "rewards/reward_fn/mean": 2.9141030311584473, "rewards/reward_fn/std": 0.03810620680451393, "step": 681 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 180.0, "completions/max_terminated_length": 180.0, "completions/mean_length": 159.53125, "completions/mean_terminated_length": 159.53125, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "epoch": 0.07234539089848308, "frac_reward_zero_std": 1.0, "grad_norm": 0.09375, "kl": 0.032492958940565586, "learning_rate": 7.727599999999999e-06, "loss": 0.0013, "num_tokens": 30374452.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 682 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 152.0, "completions/max_terminated_length": 152.0, "completions/mean_length": 113.15625, "completions/mean_terminated_length": 113.15625, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "epoch": 0.07245146918425799, "frac_reward_zero_std": 0.0, "grad_norm": 2.203125, "kl": 0.048680256586521864, "learning_rate": 7.7272e-06, "loss": 0.0019, "num_tokens": 30412665.0, "reward": 3.208629608154297, "reward_std": 0.022559581324458122, "rewards/reward_fn/mean": 3.208629608154297, "rewards/reward_fn/std": 0.022559557110071182, "step": 683 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 384.0, "completions/max_terminated_length": 384.0, "completions/mean_length": 180.65625, "completions/mean_terminated_length": 180.65625, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "epoch": 0.07255754747003289, "frac_reward_zero_std": 0.0, "grad_norm": 2.0625, "kl": 0.06957442464772612, "learning_rate": 7.7268e-06, "loss": 0.0028, "num_tokens": 30458638.0, "reward": 2.8663017749786377, "reward_std": 0.034207750111818314, "rewards/reward_fn/mean": 2.8663017749786377, "rewards/reward_fn/std": 0.034207772463560104, "step": 684 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 468.0, "completions/max_terminated_length": 468.0, "completions/mean_length": 336.78125, "completions/mean_terminated_length": 336.78125, "completions/min_length": 222.0, "completions/min_terminated_length": 222.0, "epoch": 0.07266362575580779, "frac_reward_zero_std": 0.0, "grad_norm": 1.46875, "kl": 0.057023753644898534, "learning_rate": 7.7264e-06, "loss": 0.0023, "num_tokens": 30518407.0, "reward": 3.330765962600708, "reward_std": 0.5284169316291809, "rewards/reward_fn/mean": 3.330765962600708, "rewards/reward_fn/std": 0.5284168720245361, "step": 685 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 904.0, "completions/max_terminated_length": 904.0, "completions/mean_length": 537.96875, "completions/mean_terminated_length": 537.96875, "completions/min_length": 326.0, "completions/min_terminated_length": 326.0, "epoch": 0.07276970404158269, "frac_reward_zero_std": 0.0, "grad_norm": 1.09375, "kl": 0.04859426280017942, "learning_rate": 7.726e-06, "loss": 0.0019, "num_tokens": 30574982.0, "reward": 3.4425101280212402, "reward_std": 0.5336366891860962, "rewards/reward_fn/mean": 3.4425101280212402, "rewards/reward_fn/std": 0.5336366295814514, "step": 686 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 183.0, "completions/max_terminated_length": 183.0, "completions/mean_length": 158.15625, "completions/mean_terminated_length": 158.15625, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "epoch": 0.07287578232735759, "frac_reward_zero_std": 1.0, "grad_norm": 0.13671875, "kl": 0.03376417566323653, "learning_rate": 7.7256e-06, "loss": 0.0014, "num_tokens": 30608203.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 687 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 322.0, "completions/max_terminated_length": 322.0, "completions/mean_length": 183.59375, "completions/mean_terminated_length": 183.59375, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "epoch": 0.07298186061313248, "frac_reward_zero_std": 0.0, "grad_norm": 1.6171875, "kl": 0.06158266309648752, "learning_rate": 7.7252e-06, "loss": 0.0025, "num_tokens": 30652990.0, "reward": 3.9356517791748047, "reward_std": 0.2532157003879547, "rewards/reward_fn/mean": 3.9356517791748047, "rewards/reward_fn/std": 0.2532157301902771, "step": 688 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 288.0, "completions/max_terminated_length": 288.0, "completions/mean_length": 187.125, "completions/mean_terminated_length": 187.125, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "epoch": 0.0730879388989074, "frac_reward_zero_std": 0.0, "grad_norm": 2.15625, "kl": 0.07138905744068325, "learning_rate": 7.7248e-06, "loss": 0.0029, "num_tokens": 30705058.0, "reward": 3.8710503578186035, "reward_std": 0.3467211425304413, "rewards/reward_fn/mean": 3.8710503578186035, "rewards/reward_fn/std": 0.3467211425304413, "step": 689 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 235.0, "completions/max_terminated_length": 235.0, "completions/mean_length": 208.1875, "completions/mean_terminated_length": 208.1875, "completions/min_length": 186.0, "completions/min_terminated_length": 186.0, "epoch": 0.0731940171846823, "frac_reward_zero_std": 0.0, "grad_norm": 1.109375, "kl": 0.03632815106539056, "learning_rate": 7.7244e-06, "loss": 0.0015, "num_tokens": 30759784.0, "reward": 3.0944759845733643, "reward_std": 0.016841473057866096, "rewards/reward_fn/mean": 3.0944759845733643, "rewards/reward_fn/std": 0.016841456294059753, "step": 690 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 475.0, "completions/max_terminated_length": 475.0, "completions/mean_length": 335.9375, "completions/mean_terminated_length": 335.9375, "completions/min_length": 226.0, "completions/min_terminated_length": 226.0, "epoch": 0.0733000954704572, "frac_reward_zero_std": 0.0, "grad_norm": 1.625, "kl": 0.06270804395899177, "learning_rate": 7.724e-06, "loss": 0.0025, "num_tokens": 30803366.0, "reward": 3.3440566062927246, "reward_std": 0.5181226134300232, "rewards/reward_fn/mean": 3.3440566062927246, "rewards/reward_fn/std": 0.5181225538253784, "step": 691 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 648.0, "completions/max_terminated_length": 648.0, "completions/mean_length": 251.875, "completions/mean_terminated_length": 251.875, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "epoch": 0.0734061737562321, "frac_reward_zero_std": 0.0, "grad_norm": 1.4921875, "kl": 0.056056828470900655, "learning_rate": 7.7236e-06, "loss": 0.0022, "num_tokens": 30832258.0, "reward": 3.3849637508392334, "reward_std": 0.8223517537117004, "rewards/reward_fn/mean": 3.3849637508392334, "rewards/reward_fn/std": 0.8223517537117004, "step": 692 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 517.0, "completions/max_terminated_length": 517.0, "completions/mean_length": 340.09375, "completions/mean_terminated_length": 340.09375, "completions/min_length": 202.0, "completions/min_terminated_length": 202.0, "epoch": 0.073512252042007, "frac_reward_zero_std": 0.0, "grad_norm": 1.5625, "kl": 0.05376436864025891, "learning_rate": 7.7232e-06, "loss": 0.0022, "num_tokens": 30864485.0, "reward": 3.321343421936035, "reward_std": 0.608305811882019, "rewards/reward_fn/mean": 3.321343421936035, "rewards/reward_fn/std": 0.6083057522773743, "step": 693 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 429.0, "completions/max_terminated_length": 429.0, "completions/mean_length": 309.40625, "completions/mean_terminated_length": 309.40625, "completions/min_length": 248.0, "completions/min_terminated_length": 248.0, "epoch": 0.07361833032778191, "frac_reward_zero_std": 0.0, "grad_norm": 1.15625, "kl": 0.041985310381278396, "learning_rate": 7.7228e-06, "loss": 0.0017, "num_tokens": 30887570.0, "reward": 2.8665213584899902, "reward_std": 0.019187498837709427, "rewards/reward_fn/mean": 2.8665213584899902, "rewards/reward_fn/std": 0.019187474623322487, "step": 694 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 258.0, "completions/max_terminated_length": 258.0, "completions/mean_length": 185.34375, "completions/mean_terminated_length": 185.34375, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "epoch": 0.07372440861355681, "frac_reward_zero_std": 0.0, "grad_norm": 3.546875, "kl": 0.05925154301803559, "learning_rate": 7.7224e-06, "loss": 0.0024, "num_tokens": 30929981.0, "reward": 3.384206771850586, "reward_std": 0.5540663599967957, "rewards/reward_fn/mean": 3.384206771850586, "rewards/reward_fn/std": 0.5540663003921509, "step": 695 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 172.0, "completions/max_terminated_length": 172.0, "completions/mean_length": 149.5, "completions/mean_terminated_length": 149.5, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "epoch": 0.07383048689933171, "frac_reward_zero_std": 1.0, "grad_norm": 0.07470703125, "kl": 0.0262971400807146, "learning_rate": 7.722e-06, "loss": 0.0011, "num_tokens": 30984845.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 696 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 295.0, "completions/max_terminated_length": 295.0, "completions/mean_length": 195.75, "completions/mean_terminated_length": 195.75, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "epoch": 0.0739365651851066, "frac_reward_zero_std": 0.0, "grad_norm": 1.890625, "kl": 0.06998066313099116, "learning_rate": 7.721599999999999e-06, "loss": 0.0028, "num_tokens": 31011685.0, "reward": 2.9877424240112305, "reward_std": 0.3321399986743927, "rewards/reward_fn/mean": 2.9877424240112305, "rewards/reward_fn/std": 0.3321399986743927, "step": 697 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 173.0, "completions/max_terminated_length": 173.0, "completions/mean_length": 150.65625, "completions/mean_terminated_length": 150.65625, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "epoch": 0.0740426434708815, "frac_reward_zero_std": 1.0, "grad_norm": 0.0810546875, "kl": 0.04288387484848499, "learning_rate": 7.7212e-06, "loss": 0.0017, "num_tokens": 31061082.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 698 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 239.0, "completions/max_terminated_length": 239.0, "completions/mean_length": 171.75, "completions/mean_terminated_length": 171.75, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "epoch": 0.07414872175665642, "frac_reward_zero_std": 1.0, "grad_norm": 0.10791015625, "kl": 0.047392503125593066, "learning_rate": 7.720799999999999e-06, "loss": 0.0019, "num_tokens": 31085266.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 699 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 304.0, "completions/max_terminated_length": 304.0, "completions/mean_length": 217.84375, "completions/mean_terminated_length": 217.84375, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "epoch": 0.07425480004243132, "frac_reward_zero_std": 0.0, "grad_norm": 1.3515625, "kl": 0.04449020640458912, "learning_rate": 7.7204e-06, "loss": 0.0018, "num_tokens": 31135949.0, "reward": 2.708951473236084, "reward_std": 0.029957668855786324, "rewards/reward_fn/mean": 2.708951473236084, "rewards/reward_fn/std": 0.029957666993141174, "step": 700 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 110.0, "completions/max_terminated_length": 110.0, "completions/mean_length": 94.34375, "completions/mean_terminated_length": 94.34375, "completions/min_length": 68.0, "completions/min_terminated_length": 68.0, "epoch": 0.07436087832820622, "frac_reward_zero_std": 1.0, "grad_norm": 0.09716796875, "kl": 0.05266049993224442, "learning_rate": 7.719999999999999e-06, "loss": 0.0021, "num_tokens": 31176536.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 701 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 175.0, "completions/max_terminated_length": 175.0, "completions/mean_length": 132.46875, "completions/mean_terminated_length": 132.46875, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "epoch": 0.07446695661398112, "frac_reward_zero_std": 1.0, "grad_norm": 0.099609375, "kl": 0.04129998397547752, "learning_rate": 7.7196e-06, "loss": 0.0017, "num_tokens": 31212647.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 702 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 200.0, "completions/max_terminated_length": 200.0, "completions/mean_length": 120.78125, "completions/mean_terminated_length": 120.78125, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "epoch": 0.07457303489975602, "frac_reward_zero_std": 0.0, "grad_norm": 2.234375, "kl": 0.04240784316789359, "learning_rate": 7.719199999999999e-06, "loss": 0.0017, "num_tokens": 31252096.0, "reward": 2.8910398483276367, "reward_std": 0.03602422773838043, "rewards/reward_fn/mean": 2.8910398483276367, "rewards/reward_fn/std": 0.036024242639541626, "step": 703 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 834.0, "completions/max_terminated_length": 834.0, "completions/mean_length": 436.875, "completions/mean_terminated_length": 436.875, "completions/min_length": 289.0, "completions/min_terminated_length": 289.0, "epoch": 0.07467911318553092, "frac_reward_zero_std": 0.0, "grad_norm": 1.203125, "kl": 0.059532526414841413, "learning_rate": 7.7188e-06, "loss": 0.0024, "num_tokens": 31302492.0, "reward": 3.1270198822021484, "reward_std": 0.8769363164901733, "rewards/reward_fn/mean": 3.1270198822021484, "rewards/reward_fn/std": 0.8769363164901733, "step": 704 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 445.0, "completions/max_terminated_length": 445.0, "completions/mean_length": 262.625, "completions/mean_terminated_length": 262.625, "completions/min_length": 209.0, "completions/min_terminated_length": 209.0, "epoch": 0.07478519147130583, "frac_reward_zero_std": 1.0, "grad_norm": 0.091796875, "kl": 0.04183370858663693, "learning_rate": 7.718399999999999e-06, "loss": 0.0017, "num_tokens": 31344848.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 705 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 267.0, "completions/max_terminated_length": 267.0, "completions/mean_length": 183.875, "completions/mean_terminated_length": 183.875, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "epoch": 0.07489126975708073, "frac_reward_zero_std": 1.0, "grad_norm": 0.126953125, "kl": 0.044983384606894106, "learning_rate": 7.718e-06, "loss": 0.0018, "num_tokens": 31384556.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 706 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 206.0, "completions/max_terminated_length": 206.0, "completions/mean_length": 171.9375, "completions/mean_terminated_length": 171.9375, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "epoch": 0.07499734804285563, "frac_reward_zero_std": 1.0, "grad_norm": 0.0947265625, "kl": 0.05026729096425697, "learning_rate": 7.7176e-06, "loss": 0.002, "num_tokens": 31441322.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 707 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 193.0, "completions/max_terminated_length": 193.0, "completions/mean_length": 169.9375, "completions/mean_terminated_length": 169.9375, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "epoch": 0.07510342632863053, "frac_reward_zero_std": 1.0, "grad_norm": 0.09375, "kl": 0.036427939659915864, "learning_rate": 7.7172e-06, "loss": 0.0015, "num_tokens": 31478984.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 708 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 152.0, "completions/max_terminated_length": 152.0, "completions/mean_length": 136.125, "completions/mean_terminated_length": 136.125, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "epoch": 0.07520950461440543, "frac_reward_zero_std": 1.0, "grad_norm": 0.142578125, "kl": 0.05036911822389811, "learning_rate": 7.7168e-06, "loss": 0.002, "num_tokens": 31513708.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 709 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 177.96875, "completions/mean_terminated_length": 177.96875, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "epoch": 0.07531558290018034, "frac_reward_zero_std": 0.0, "grad_norm": 1.625, "kl": 0.06007158639840782, "learning_rate": 7.7164e-06, "loss": 0.0024, "num_tokens": 31555595.0, "reward": 3.960479736328125, "reward_std": 0.15552453696727753, "rewards/reward_fn/mean": 3.960479736328125, "rewards/reward_fn/std": 0.15552456676959991, "step": 710 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 453.0, "completions/max_terminated_length": 453.0, "completions/mean_length": 273.90625, "completions/mean_terminated_length": 273.90625, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "epoch": 0.07542166118595524, "frac_reward_zero_std": 0.0, "grad_norm": 1.8828125, "kl": 0.07482221210375428, "learning_rate": 7.716e-06, "loss": 0.003, "num_tokens": 31601640.0, "reward": 3.256192445755005, "reward_std": 0.5825570225715637, "rewards/reward_fn/mean": 3.256192445755005, "rewards/reward_fn/std": 0.5825570225715637, "step": 711 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 335.0, "completions/max_terminated_length": 335.0, "completions/mean_length": 182.46875, "completions/mean_terminated_length": 182.46875, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "epoch": 0.07552773947173014, "frac_reward_zero_std": 0.0, "grad_norm": 1.375, "kl": 0.038130709261167794, "learning_rate": 7.7156e-06, "loss": 0.0015, "num_tokens": 31651319.0, "reward": 3.974000930786133, "reward_std": 0.14707225561141968, "rewards/reward_fn/mean": 3.974000930786133, "rewards/reward_fn/std": 0.1470722258090973, "step": 712 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 234.0, "completions/max_terminated_length": 234.0, "completions/mean_length": 180.625, "completions/mean_terminated_length": 180.625, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "epoch": 0.07563381775750504, "frac_reward_zero_std": 0.0, "grad_norm": 2.140625, "kl": 0.057246467215009034, "learning_rate": 7.7152e-06, "loss": 0.0023, "num_tokens": 31680747.0, "reward": 3.156899929046631, "reward_std": 0.5360119342803955, "rewards/reward_fn/mean": 3.156899929046631, "rewards/reward_fn/std": 0.5360119342803955, "step": 713 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 213.0, "completions/max_terminated_length": 213.0, "completions/mean_length": 176.75, "completions/mean_terminated_length": 176.75, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "epoch": 0.07573989604327994, "frac_reward_zero_std": 1.0, "grad_norm": 0.087890625, "kl": 0.032892783579882234, "learning_rate": 7.7148e-06, "loss": 0.0013, "num_tokens": 31719331.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 714 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 154.0, "completions/max_terminated_length": 154.0, "completions/mean_length": 139.5, "completions/mean_terminated_length": 139.5, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "epoch": 0.07584597432905485, "frac_reward_zero_std": 1.0, "grad_norm": 0.08056640625, "kl": 0.02656198089243844, "learning_rate": 7.7144e-06, "loss": 0.0011, "num_tokens": 31745331.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 715 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 158.0, "completions/max_terminated_length": 158.0, "completions/mean_length": 137.25, "completions/mean_terminated_length": 137.25, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "epoch": 0.07595205261482975, "frac_reward_zero_std": 1.0, "grad_norm": 0.10546875, "kl": 0.036677059542853385, "learning_rate": 7.714e-06, "loss": 0.0015, "num_tokens": 31782683.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 716 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 257.0, "completions/max_terminated_length": 257.0, "completions/mean_length": 198.65625, "completions/mean_terminated_length": 198.65625, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "epoch": 0.07605813090060465, "frac_reward_zero_std": 1.0, "grad_norm": 0.1240234375, "kl": 0.046608570439275354, "learning_rate": 7.713599999999998e-06, "loss": 0.0019, "num_tokens": 31828976.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 717 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 285.0, "completions/max_terminated_length": 285.0, "completions/mean_length": 238.84375, "completions/mean_terminated_length": 238.84375, "completions/min_length": 204.0, "completions/min_terminated_length": 204.0, "epoch": 0.07616420918637955, "frac_reward_zero_std": 1.0, "grad_norm": 0.3515625, "kl": 0.06297203176654875, "learning_rate": 7.7132e-06, "loss": 0.0025, "num_tokens": 31879499.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 718 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 444.0, "completions/max_terminated_length": 444.0, "completions/mean_length": 283.40625, "completions/mean_terminated_length": 283.40625, "completions/min_length": 207.0, "completions/min_terminated_length": 207.0, "epoch": 0.07627028747215445, "frac_reward_zero_std": 0.0, "grad_norm": 1.375, "kl": 0.0723530335817486, "learning_rate": 7.7128e-06, "loss": 0.0029, "num_tokens": 31919480.0, "reward": 2.887519359588623, "reward_std": 0.4284627139568329, "rewards/reward_fn/mean": 2.887519359588623, "rewards/reward_fn/std": 0.4284627139568329, "step": 719 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 93.0, "completions/max_terminated_length": 93.0, "completions/mean_length": 70.34375, "completions/mean_terminated_length": 70.34375, "completions/min_length": 59.0, "completions/min_terminated_length": 59.0, "epoch": 0.07637636575792935, "frac_reward_zero_std": 1.0, "grad_norm": 0.17578125, "kl": 0.03475224046269432, "learning_rate": 7.7124e-06, "loss": 0.0014, "num_tokens": 31959587.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 720 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 480.0, "completions/max_terminated_length": 480.0, "completions/mean_length": 312.59375, "completions/mean_terminated_length": 312.59375, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "epoch": 0.07648244404370426, "frac_reward_zero_std": 0.0, "grad_norm": 1.078125, "kl": 0.0463191430317238, "learning_rate": 7.712e-06, "loss": 0.0019, "num_tokens": 32017942.0, "reward": 3.9643290042877197, "reward_std": 0.20178602635860443, "rewards/reward_fn/mean": 3.9643290042877197, "rewards/reward_fn/std": 0.20178604125976562, "step": 721 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 449.0, "completions/max_terminated_length": 449.0, "completions/mean_length": 295.0625, "completions/mean_terminated_length": 295.0625, "completions/min_length": 216.0, "completions/min_terminated_length": 216.0, "epoch": 0.07658852232947916, "frac_reward_zero_std": 0.0, "grad_norm": 1.5703125, "kl": 0.04168545902939513, "learning_rate": 7.711599999999999e-06, "loss": 0.0017, "num_tokens": 32067992.0, "reward": 2.7265396118164062, "reward_std": 0.033310580998659134, "rewards/reward_fn/mean": 2.7265396118164062, "rewards/reward_fn/std": 0.03331058472394943, "step": 722 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 327.0, "completions/max_terminated_length": 327.0, "completions/mean_length": 258.5, "completions/mean_terminated_length": 258.5, "completions/min_length": 219.0, "completions/min_terminated_length": 219.0, "epoch": 0.07669460061525406, "frac_reward_zero_std": 0.0, "grad_norm": 1.25, "kl": 0.029477198084350675, "learning_rate": 7.7112e-06, "loss": 0.0012, "num_tokens": 32112904.0, "reward": 1.7350808382034302, "reward_std": 0.022046852856874466, "rewards/reward_fn/mean": 1.7350808382034302, "rewards/reward_fn/std": 0.022046852856874466, "step": 723 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 205.0, "completions/max_terminated_length": 205.0, "completions/mean_length": 176.0, "completions/mean_terminated_length": 176.0, "completions/min_length": 170.0, "completions/min_terminated_length": 170.0, "epoch": 0.07680067890102896, "frac_reward_zero_std": 0.0, "grad_norm": 1.390625, "kl": 0.04448731255251914, "learning_rate": 7.710799999999999e-06, "loss": 0.0018, "num_tokens": 32149416.0, "reward": 2.949578285217285, "reward_std": 0.004209110513329506, "rewards/reward_fn/mean": 2.949578285217285, "rewards/reward_fn/std": 0.00420913752168417, "step": 724 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 723.0, "completions/max_terminated_length": 723.0, "completions/mean_length": 343.71875, "completions/mean_terminated_length": 343.71875, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "epoch": 0.07690675718680386, "frac_reward_zero_std": 0.0, "grad_norm": 3.3125, "kl": 0.07948042592033744, "learning_rate": 7.7104e-06, "loss": 0.0032, "num_tokens": 32209151.0, "reward": 2.8511264324188232, "reward_std": 0.054016876965761185, "rewards/reward_fn/mean": 2.8511264324188232, "rewards/reward_fn/std": 0.054016903042793274, "step": 725 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 328.0, "completions/max_terminated_length": 328.0, "completions/mean_length": 248.09375, "completions/mean_terminated_length": 248.09375, "completions/min_length": 185.0, "completions/min_terminated_length": 185.0, "epoch": 0.07701283547257877, "frac_reward_zero_std": 1.0, "grad_norm": 0.095703125, "kl": 0.0459744130494073, "learning_rate": 7.709999999999999e-06, "loss": 0.0018, "num_tokens": 32258082.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 726 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 517.0, "completions/max_terminated_length": 517.0, "completions/mean_length": 289.59375, "completions/mean_terminated_length": 289.59375, "completions/min_length": 245.0, "completions/min_terminated_length": 245.0, "epoch": 0.07711891375835367, "frac_reward_zero_std": 0.0, "grad_norm": 1.3359375, "kl": 0.05580032308353111, "learning_rate": 7.7096e-06, "loss": 0.0022, "num_tokens": 32311349.0, "reward": 2.961806297302246, "reward_std": 0.3445666432380676, "rewards/reward_fn/mean": 2.961806297302246, "rewards/reward_fn/std": 0.34456658363342285, "step": 727 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 307.0, "completions/max_terminated_length": 307.0, "completions/mean_length": 220.6875, "completions/mean_terminated_length": 220.6875, "completions/min_length": 179.0, "completions/min_terminated_length": 179.0, "epoch": 0.07722499204412857, "frac_reward_zero_std": 0.0, "grad_norm": 1.640625, "kl": 0.037108064629137516, "learning_rate": 7.709199999999999e-06, "loss": 0.0015, "num_tokens": 32359083.0, "reward": 3.6539883613586426, "reward_std": 0.5620724558830261, "rewards/reward_fn/mean": 3.6539883613586426, "rewards/reward_fn/std": 0.5620723962783813, "step": 728 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 676.0, "completions/max_terminated_length": 676.0, "completions/mean_length": 319.0625, "completions/mean_terminated_length": 319.0625, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "epoch": 0.07733107032990347, "frac_reward_zero_std": 0.0, "grad_norm": 1.5859375, "kl": 0.06664447928778827, "learning_rate": 7.7088e-06, "loss": 0.0027, "num_tokens": 32406445.0, "reward": 2.7638401985168457, "reward_std": 0.045684244483709335, "rewards/reward_fn/mean": 2.7638401985168457, "rewards/reward_fn/std": 0.045684244483709335, "step": 729 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 394.0, "completions/max_terminated_length": 394.0, "completions/mean_length": 186.78125, "completions/mean_terminated_length": 186.78125, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "epoch": 0.07743714861567837, "frac_reward_zero_std": 0.0, "grad_norm": 2.546875, "kl": 0.07360926462570205, "learning_rate": 7.7084e-06, "loss": 0.0029, "num_tokens": 32430950.0, "reward": 3.3820438385009766, "reward_std": 0.6281734704971313, "rewards/reward_fn/mean": 3.3820438385009766, "rewards/reward_fn/std": 0.6281735301017761, "step": 730 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 207.0, "completions/max_terminated_length": 207.0, "completions/mean_length": 174.9375, "completions/mean_terminated_length": 174.9375, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "epoch": 0.07754322690145327, "frac_reward_zero_std": 1.0, "grad_norm": 0.1171875, "kl": 0.07646584114991128, "learning_rate": 7.708e-06, "loss": 0.0031, "num_tokens": 32479588.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 731 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 82.0, "completions/max_terminated_length": 82.0, "completions/mean_length": 81.25, "completions/mean_terminated_length": 81.25, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "epoch": 0.07764930518722818, "frac_reward_zero_std": 1.0, "grad_norm": 0.0693359375, "kl": 0.022484338143840432, "learning_rate": 7.7076e-06, "loss": 0.0009, "num_tokens": 32502924.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 732 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 374.0, "completions/max_terminated_length": 374.0, "completions/mean_length": 231.125, "completions/mean_terminated_length": 231.125, "completions/min_length": 178.0, "completions/min_terminated_length": 178.0, "epoch": 0.07775538347300308, "frac_reward_zero_std": 0.0, "grad_norm": 1.75, "kl": 0.07375269406475127, "learning_rate": 7.7072e-06, "loss": 0.003, "num_tokens": 32550960.0, "reward": 3.9716763496398926, "reward_std": 0.16022291779518127, "rewards/reward_fn/mean": 3.9716763496398926, "rewards/reward_fn/std": 0.16022291779518127, "step": 733 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 235.0, "completions/max_terminated_length": 235.0, "completions/mean_length": 194.03125, "completions/mean_terminated_length": 194.03125, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "epoch": 0.07786146175877798, "frac_reward_zero_std": 1.0, "grad_norm": 0.11328125, "kl": 0.052297455724328756, "learning_rate": 7.7068e-06, "loss": 0.0021, "num_tokens": 32592881.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 734 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 609.0, "completions/max_terminated_length": 609.0, "completions/mean_length": 383.125, "completions/mean_terminated_length": 383.125, "completions/min_length": 249.0, "completions/min_terminated_length": 249.0, "epoch": 0.07796754004455288, "frac_reward_zero_std": 0.0, "grad_norm": 1.109375, "kl": 0.04508164117578417, "learning_rate": 7.7064e-06, "loss": 0.0018, "num_tokens": 32650485.0, "reward": 3.0846705436706543, "reward_std": 0.07166870683431625, "rewards/reward_fn/mean": 3.0846705436706543, "rewards/reward_fn/std": 0.07166869193315506, "step": 735 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 91.0, "completions/max_terminated_length": 91.0, "completions/mean_length": 84.71875, "completions/mean_terminated_length": 84.71875, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "epoch": 0.07807361833032778, "frac_reward_zero_std": 1.0, "grad_norm": 0.10302734375, "kl": 0.025161666912026703, "learning_rate": 7.706e-06, "loss": 0.001, "num_tokens": 32696460.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 736 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 234.0, "completions/max_terminated_length": 234.0, "completions/mean_length": 200.65625, "completions/mean_terminated_length": 200.65625, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "epoch": 0.07817969661610269, "frac_reward_zero_std": 1.0, "grad_norm": 0.08984375, "kl": 0.044102601415943354, "learning_rate": 7.7056e-06, "loss": 0.0018, "num_tokens": 32737377.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 737 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 331.0, "completions/max_terminated_length": 331.0, "completions/mean_length": 265.625, "completions/mean_terminated_length": 265.625, "completions/min_length": 223.0, "completions/min_terminated_length": 223.0, "epoch": 0.07828577490187759, "frac_reward_zero_std": 0.0, "grad_norm": 1.2890625, "kl": 0.04195326648186892, "learning_rate": 7.705199999999999e-06, "loss": 0.0017, "num_tokens": 32762997.0, "reward": 2.9239230155944824, "reward_std": 0.020110029727220535, "rewards/reward_fn/mean": 2.9239230155944824, "rewards/reward_fn/std": 0.02011003904044628, "step": 738 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 395.0, "completions/max_terminated_length": 395.0, "completions/mean_length": 279.125, "completions/mean_terminated_length": 279.125, "completions/min_length": 226.0, "completions/min_terminated_length": 226.0, "epoch": 0.07839185318765249, "frac_reward_zero_std": 0.0, "grad_norm": 1.7109375, "kl": 0.048967642593197525, "learning_rate": 7.7048e-06, "loss": 0.002, "num_tokens": 32810553.0, "reward": 3.0665769577026367, "reward_std": 0.17190836369991302, "rewards/reward_fn/mean": 3.0665769577026367, "rewards/reward_fn/std": 0.17190837860107422, "step": 739 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 352.0, "completions/max_terminated_length": 352.0, "completions/mean_length": 242.21875, "completions/mean_terminated_length": 242.21875, "completions/min_length": 196.0, "completions/min_terminated_length": 196.0, "epoch": 0.07849793147342739, "frac_reward_zero_std": 1.0, "grad_norm": 0.1005859375, "kl": 0.05765526695176959, "learning_rate": 7.704399999999999e-06, "loss": 0.0023, "num_tokens": 32882304.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 740 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 167.0, "completions/max_terminated_length": 167.0, "completions/mean_length": 119.625, "completions/mean_terminated_length": 119.625, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "epoch": 0.07860400975920229, "frac_reward_zero_std": 0.0, "grad_norm": 3.34375, "kl": 0.05220557638676837, "learning_rate": 7.704e-06, "loss": 0.0021, "num_tokens": 32919092.0, "reward": 3.2061731815338135, "reward_std": 0.0629623532295227, "rewards/reward_fn/mean": 3.2061731815338135, "rewards/reward_fn/std": 0.06296232342720032, "step": 741 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 227.0, "completions/max_terminated_length": 227.0, "completions/mean_length": 164.40625, "completions/mean_terminated_length": 164.40625, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "epoch": 0.0787100880449772, "frac_reward_zero_std": 1.0, "grad_norm": 0.140625, "kl": 0.04799942090176046, "learning_rate": 7.7036e-06, "loss": 0.0019, "num_tokens": 32954721.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 742 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 309.0, "completions/max_terminated_length": 309.0, "completions/mean_length": 217.0625, "completions/mean_terminated_length": 217.0625, "completions/min_length": 183.0, "completions/min_terminated_length": 183.0, "epoch": 0.0788161663307521, "frac_reward_zero_std": 0.0, "grad_norm": 1.2421875, "kl": 0.04830415640026331, "learning_rate": 7.7032e-06, "loss": 0.0019, "num_tokens": 33002563.0, "reward": 3.9048845767974854, "reward_std": 0.3005771040916443, "rewards/reward_fn/mean": 3.9048845767974854, "rewards/reward_fn/std": 0.3005771338939667, "step": 743 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 519.0, "completions/max_terminated_length": 519.0, "completions/mean_length": 330.90625, "completions/mean_terminated_length": 330.90625, "completions/min_length": 271.0, "completions/min_terminated_length": 271.0, "epoch": 0.078922244616527, "frac_reward_zero_std": 0.0, "grad_norm": 1.1875, "kl": 0.05085050774505362, "learning_rate": 7.7028e-06, "loss": 0.002, "num_tokens": 33033440.0, "reward": 2.793060302734375, "reward_std": 0.5311539173126221, "rewards/reward_fn/mean": 2.793060302734375, "rewards/reward_fn/std": 0.5311539173126221, "step": 744 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 107.0, "completions/max_terminated_length": 107.0, "completions/mean_length": 99.71875, "completions/mean_terminated_length": 99.71875, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "epoch": 0.0790283229023019, "frac_reward_zero_std": 1.0, "grad_norm": 0.07421875, "kl": 0.019555108272470534, "learning_rate": 7.7024e-06, "loss": 0.0008, "num_tokens": 33075415.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 745 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 277.0, "completions/max_terminated_length": 277.0, "completions/mean_length": 207.15625, "completions/mean_terminated_length": 207.15625, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "epoch": 0.0791344011880768, "frac_reward_zero_std": 1.0, "grad_norm": 0.07666015625, "kl": 0.03279089758871123, "learning_rate": 7.702e-06, "loss": 0.0013, "num_tokens": 33118108.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 746 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 254.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 206.4375, "completions/mean_terminated_length": 206.4375, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "epoch": 0.0792404794738517, "frac_reward_zero_std": 1.0, "grad_norm": 0.09814453125, "kl": 0.04685262369457632, "learning_rate": 7.7016e-06, "loss": 0.0019, "num_tokens": 33146314.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 747 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 291.0, "completions/max_terminated_length": 291.0, "completions/mean_length": 149.9375, "completions/mean_terminated_length": 149.9375, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "epoch": 0.07934655775962661, "frac_reward_zero_std": 0.0, "grad_norm": 2.546875, "kl": 0.06066811521304771, "learning_rate": 7.7012e-06, "loss": 0.0024, "num_tokens": 33171688.0, "reward": 3.004272699356079, "reward_std": 0.037238411605358124, "rewards/reward_fn/mean": 3.004272699356079, "rewards/reward_fn/std": 0.037238407880067825, "step": 748 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 155.0, "completions/max_terminated_length": 155.0, "completions/mean_length": 112.84375, "completions/mean_terminated_length": 112.84375, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "epoch": 0.07945263604540151, "frac_reward_zero_std": 1.0, "grad_norm": 0.1552734375, "kl": 0.07324639323633164, "learning_rate": 7.7008e-06, "loss": 0.0029, "num_tokens": 33221411.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 749 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 165.0, "completions/max_terminated_length": 165.0, "completions/mean_length": 129.59375, "completions/mean_terminated_length": 129.59375, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "epoch": 0.07955871433117641, "frac_reward_zero_std": 1.0, "grad_norm": 0.0986328125, "kl": 0.023909973591798916, "learning_rate": 7.7004e-06, "loss": 0.001, "num_tokens": 33269782.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 750 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 239.0, "completions/max_terminated_length": 239.0, "completions/mean_length": 170.65625, "completions/mean_terminated_length": 170.65625, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "epoch": 0.07966479261695131, "frac_reward_zero_std": 1.0, "grad_norm": 0.140625, "kl": 0.06489493430126458, "learning_rate": 7.699999999999999e-06, "loss": 0.0026, "num_tokens": 33311339.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 751 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 278.0, "completions/max_terminated_length": 278.0, "completions/mean_length": 208.28125, "completions/mean_terminated_length": 208.28125, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "epoch": 0.07977087090272621, "frac_reward_zero_std": 0.0, "grad_norm": 1.5703125, "kl": 0.06098278262652457, "learning_rate": 7.6996e-06, "loss": 0.0024, "num_tokens": 33369396.0, "reward": 3.959812641143799, "reward_std": 0.22733472287654877, "rewards/reward_fn/mean": 3.959812641143799, "rewards/reward_fn/std": 0.22733475267887115, "step": 752 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 171.0, "completions/max_terminated_length": 171.0, "completions/mean_length": 113.96875, "completions/mean_terminated_length": 113.96875, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "epoch": 0.07987694918850112, "frac_reward_zero_std": 1.0, "grad_norm": 0.1435546875, "kl": 0.04324968159198761, "learning_rate": 7.6992e-06, "loss": 0.0017, "num_tokens": 33407827.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 753 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 945.0, "completions/max_terminated_length": 945.0, "completions/mean_length": 557.0, "completions/mean_terminated_length": 557.0, "completions/min_length": 332.0, "completions/min_terminated_length": 332.0, "epoch": 0.07998302747427602, "frac_reward_zero_std": 0.0, "grad_norm": 1.2109375, "kl": 0.04723305656807497, "learning_rate": 7.6988e-06, "loss": 0.0019, "num_tokens": 33470643.0, "reward": 2.8307480812072754, "reward_std": 0.05510137230157852, "rewards/reward_fn/mean": 2.8307480812072754, "rewards/reward_fn/std": 0.05510134622454643, "step": 754 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 128.0, "completions/max_terminated_length": 128.0, "completions/mean_length": 96.25, "completions/mean_terminated_length": 96.25, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "epoch": 0.08008910576005092, "frac_reward_zero_std": 1.0, "grad_norm": 0.14453125, "kl": 0.03696786391083151, "learning_rate": 7.6984e-06, "loss": 0.0015, "num_tokens": 33511419.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 755 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 175.0, "completions/max_terminated_length": 175.0, "completions/mean_length": 142.75, "completions/mean_terminated_length": 142.75, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "epoch": 0.08019518404582582, "frac_reward_zero_std": 1.0, "grad_norm": 0.1279296875, "kl": 0.04854514845646918, "learning_rate": 7.698e-06, "loss": 0.0019, "num_tokens": 33545587.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 756 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 211.0, "completions/max_terminated_length": 211.0, "completions/mean_length": 175.09375, "completions/mean_terminated_length": 175.09375, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "epoch": 0.08030126233160072, "frac_reward_zero_std": 0.0, "grad_norm": 2.015625, "kl": 0.039331718639004976, "learning_rate": 7.6976e-06, "loss": 0.0016, "num_tokens": 33587734.0, "reward": 2.9345703125, "reward_std": 0.014557859860360622, "rewards/reward_fn/mean": 2.9345703125, "rewards/reward_fn/std": 0.014557869173586369, "step": 757 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 257.0, "completions/max_terminated_length": 257.0, "completions/mean_length": 176.34375, "completions/mean_terminated_length": 176.34375, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "epoch": 0.08040734061737562, "frac_reward_zero_std": 1.0, "grad_norm": 0.1279296875, "kl": 0.039623707241844386, "learning_rate": 7.6972e-06, "loss": 0.0016, "num_tokens": 33611521.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 758 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 308.0, "completions/max_terminated_length": 308.0, "completions/mean_length": 238.625, "completions/mean_terminated_length": 238.625, "completions/min_length": 189.0, "completions/min_terminated_length": 189.0, "epoch": 0.08051341890315053, "frac_reward_zero_std": 0.0, "grad_norm": 1.7109375, "kl": 0.05152698885649443, "learning_rate": 7.696799999999999e-06, "loss": 0.0021, "num_tokens": 33652117.0, "reward": 3.8735275268554688, "reward_std": 0.2986966669559479, "rewards/reward_fn/mean": 3.8735275268554688, "rewards/reward_fn/std": 0.2986966669559479, "step": 759 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 349.0, "completions/max_terminated_length": 349.0, "completions/mean_length": 265.4375, "completions/mean_terminated_length": 265.4375, "completions/min_length": 235.0, "completions/min_terminated_length": 235.0, "epoch": 0.08061949718892543, "frac_reward_zero_std": 0.0, "grad_norm": 1.1484375, "kl": 0.04387016425607726, "learning_rate": 7.6964e-06, "loss": 0.0018, "num_tokens": 33682211.0, "reward": 2.870743751525879, "reward_std": 0.029427386820316315, "rewards/reward_fn/mean": 2.870743751525879, "rewards/reward_fn/std": 0.029427384957671165, "step": 760 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 488.0, "completions/max_terminated_length": 488.0, "completions/mean_length": 336.125, "completions/mean_terminated_length": 336.125, "completions/min_length": 186.0, "completions/min_terminated_length": 186.0, "epoch": 0.08072557547470033, "frac_reward_zero_std": 1.0, "grad_norm": 0.09912109375, "kl": 0.04809122672304511, "learning_rate": 7.695999999999999e-06, "loss": 0.0019, "num_tokens": 33731975.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 761 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 439.0, "completions/max_terminated_length": 439.0, "completions/mean_length": 180.53125, "completions/mean_terminated_length": 180.53125, "completions/min_length": 77.0, "completions/min_terminated_length": 77.0, "epoch": 0.08083165376047523, "frac_reward_zero_std": 0.0, "grad_norm": 1.96875, "kl": 0.0938791740918532, "learning_rate": 7.6956e-06, "loss": 0.0038, "num_tokens": 33758904.0, "reward": 3.966001272201538, "reward_std": 0.19232597947120667, "rewards/reward_fn/mean": 3.966001272201538, "rewards/reward_fn/std": 0.19232596457004547, "step": 762 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 510.0, "completions/max_terminated_length": 510.0, "completions/mean_length": 293.84375, "completions/mean_terminated_length": 293.84375, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "epoch": 0.08093773204625013, "frac_reward_zero_std": 0.0, "grad_norm": 2.5625, "kl": 0.1058124101255089, "learning_rate": 7.695199999999999e-06, "loss": 0.0042, "num_tokens": 33796659.0, "reward": 2.5515646934509277, "reward_std": 0.5192806720733643, "rewards/reward_fn/mean": 2.5515646934509277, "rewards/reward_fn/std": 0.5192806720733643, "step": 763 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 573.0, "completions/max_terminated_length": 573.0, "completions/mean_length": 436.25, "completions/mean_terminated_length": 436.25, "completions/min_length": 327.0, "completions/min_terminated_length": 327.0, "epoch": 0.08104381033202504, "frac_reward_zero_std": 0.0, "grad_norm": 1.359375, "kl": 0.061225235112942755, "learning_rate": 7.6948e-06, "loss": 0.0024, "num_tokens": 33843131.0, "reward": 2.933668375015259, "reward_std": 0.052915628999471664, "rewards/reward_fn/mean": 2.933668375015259, "rewards/reward_fn/std": 0.05291564017534256, "step": 764 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 330.0, "completions/max_terminated_length": 330.0, "completions/mean_length": 230.96875, "completions/mean_terminated_length": 230.96875, "completions/min_length": 182.0, "completions/min_terminated_length": 182.0, "epoch": 0.08114988861779994, "frac_reward_zero_std": 0.0, "grad_norm": 1.515625, "kl": 0.050095128593966365, "learning_rate": 7.6944e-06, "loss": 0.002, "num_tokens": 33868250.0, "reward": 3.302189350128174, "reward_std": 0.039182018488645554, "rewards/reward_fn/mean": 3.302189350128174, "rewards/reward_fn/std": 0.03918198496103287, "step": 765 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 231.65625, "completions/mean_terminated_length": 231.65625, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "epoch": 0.08125596690357484, "frac_reward_zero_std": 0.0, "grad_norm": 1.828125, "kl": 0.03903214994352311, "learning_rate": 7.694e-06, "loss": 0.0016, "num_tokens": 33926703.0, "reward": 3.2802817821502686, "reward_std": 0.5692557096481323, "rewards/reward_fn/mean": 3.2802817821502686, "rewards/reward_fn/std": 0.5692556500434875, "step": 766 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 411.0, "completions/max_terminated_length": 411.0, "completions/mean_length": 307.5625, "completions/mean_terminated_length": 307.5625, "completions/min_length": 251.0, "completions/min_terminated_length": 251.0, "epoch": 0.08136204518934974, "frac_reward_zero_std": 1.0, "grad_norm": 0.07568359375, "kl": 0.03617817087797448, "learning_rate": 7.6936e-06, "loss": 0.0014, "num_tokens": 33986145.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 767 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 642.0, "completions/max_terminated_length": 642.0, "completions/mean_length": 385.15625, "completions/mean_terminated_length": 385.15625, "completions/min_length": 243.0, "completions/min_terminated_length": 243.0, "epoch": 0.08146812347512464, "frac_reward_zero_std": 0.0, "grad_norm": 1.40625, "kl": 0.07895687408745289, "learning_rate": 7.6932e-06, "loss": 0.0032, "num_tokens": 34029030.0, "reward": 3.0626044273376465, "reward_std": 0.08735579997301102, "rewards/reward_fn/mean": 3.0626044273376465, "rewards/reward_fn/std": 0.08735582232475281, "step": 768 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 255.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 132.875, "completions/mean_terminated_length": 132.875, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "epoch": 0.08157420176089955, "frac_reward_zero_std": 1.0, "grad_norm": 0.1455078125, "kl": 0.0394937681267038, "learning_rate": 7.6928e-06, "loss": 0.0016, "num_tokens": 34062274.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 769 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 276.0, "completions/max_terminated_length": 276.0, "completions/mean_length": 192.40625, "completions/mean_terminated_length": 192.40625, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "epoch": 0.08168028004667445, "frac_reward_zero_std": 0.0, "grad_norm": 2.265625, "kl": 0.05581576417898759, "learning_rate": 7.6924e-06, "loss": 0.0022, "num_tokens": 34101487.0, "reward": 2.8739709854125977, "reward_std": 0.07851967960596085, "rewards/reward_fn/mean": 2.8739709854125977, "rewards/reward_fn/std": 0.07851970195770264, "step": 770 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 156.0, "completions/max_terminated_length": 156.0, "completions/mean_length": 137.15625, "completions/mean_terminated_length": 137.15625, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "epoch": 0.08178635833244935, "frac_reward_zero_std": 1.0, "grad_norm": 0.10693359375, "kl": 0.047419965849258006, "learning_rate": 7.692e-06, "loss": 0.0019, "num_tokens": 34126324.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 771 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 274.0, "completions/max_terminated_length": 274.0, "completions/mean_length": 207.03125, "completions/mean_terminated_length": 207.03125, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "epoch": 0.08189243661822425, "frac_reward_zero_std": 1.0, "grad_norm": 0.115234375, "kl": 0.02880560705671087, "learning_rate": 7.6916e-06, "loss": 0.0012, "num_tokens": 34164597.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 772 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 371.0, "completions/max_terminated_length": 371.0, "completions/mean_length": 262.96875, "completions/mean_terminated_length": 262.96875, "completions/min_length": 216.0, "completions/min_terminated_length": 216.0, "epoch": 0.08199851490399915, "frac_reward_zero_std": 1.0, "grad_norm": 0.0693359375, "kl": 0.034354471892584115, "learning_rate": 7.6912e-06, "loss": 0.0014, "num_tokens": 34206228.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 773 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 195.0, "completions/max_terminated_length": 195.0, "completions/mean_length": 174.96875, "completions/mean_terminated_length": 174.96875, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "epoch": 0.08210459318977405, "frac_reward_zero_std": 0.0, "grad_norm": 1.90625, "kl": 0.03377506849938072, "learning_rate": 7.6908e-06, "loss": 0.0014, "num_tokens": 34263987.0, "reward": 3.027437686920166, "reward_std": 0.023412281647324562, "rewards/reward_fn/mean": 3.027437686920166, "rewards/reward_fn/std": 0.023412277922034264, "step": 774 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 325.0, "completions/max_terminated_length": 325.0, "completions/mean_length": 188.4375, "completions/mean_terminated_length": 188.4375, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "epoch": 0.08221067147554896, "frac_reward_zero_std": 1.0, "grad_norm": 0.0986328125, "kl": 0.03890780231449753, "learning_rate": 7.6904e-06, "loss": 0.0016, "num_tokens": 34304385.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 775 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 399.0, "completions/max_terminated_length": 399.0, "completions/mean_length": 283.65625, "completions/mean_terminated_length": 283.65625, "completions/min_length": 211.0, "completions/min_terminated_length": 211.0, "epoch": 0.08231674976132386, "frac_reward_zero_std": 1.0, "grad_norm": 0.07421875, "kl": 0.03191797385807149, "learning_rate": 7.69e-06, "loss": 0.0013, "num_tokens": 34345942.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 776 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 213.0, "completions/max_terminated_length": 213.0, "completions/mean_length": 132.625, "completions/mean_terminated_length": 132.625, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "epoch": 0.08242282804709876, "frac_reward_zero_std": 0.0, "grad_norm": 2.5, "kl": 0.06634008465334773, "learning_rate": 7.6896e-06, "loss": 0.0026, "num_tokens": 34385514.0, "reward": 2.8560264110565186, "reward_std": 0.024654332548379898, "rewards/reward_fn/mean": 2.8560264110565186, "rewards/reward_fn/std": 0.024654347449541092, "step": 777 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 432.0, "completions/max_terminated_length": 432.0, "completions/mean_length": 202.5, "completions/mean_terminated_length": 202.5, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "epoch": 0.08252890633287366, "frac_reward_zero_std": 0.0, "grad_norm": 1.8515625, "kl": 0.07904170651454479, "learning_rate": 7.6892e-06, "loss": 0.0032, "num_tokens": 34429882.0, "reward": 3.814371109008789, "reward_std": 0.39530250430107117, "rewards/reward_fn/mean": 3.814371109008789, "rewards/reward_fn/std": 0.3953024744987488, "step": 778 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 94.0, "completions/max_terminated_length": 94.0, "completions/mean_length": 86.90625, "completions/mean_terminated_length": 86.90625, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "epoch": 0.08263498461864856, "frac_reward_zero_std": 1.0, "grad_norm": 0.09912109375, "kl": 0.021457875292981043, "learning_rate": 7.6888e-06, "loss": 0.0009, "num_tokens": 34468695.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 779 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 472.0, "completions/max_terminated_length": 472.0, "completions/mean_length": 328.5, "completions/mean_terminated_length": 328.5, "completions/min_length": 255.0, "completions/min_terminated_length": 255.0, "epoch": 0.08274106290442347, "frac_reward_zero_std": 0.0, "grad_norm": 1.3046875, "kl": 0.054818932549096644, "learning_rate": 7.688399999999999e-06, "loss": 0.0022, "num_tokens": 34513159.0, "reward": 2.7019734382629395, "reward_std": 0.16798020899295807, "rewards/reward_fn/mean": 2.7019734382629395, "rewards/reward_fn/std": 0.16798023879528046, "step": 780 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 582.0, "completions/max_terminated_length": 582.0, "completions/mean_length": 292.5, "completions/mean_terminated_length": 292.5, "completions/min_length": 171.0, "completions/min_terminated_length": 171.0, "epoch": 0.08284714119019837, "frac_reward_zero_std": 0.0, "grad_norm": 2.15625, "kl": 0.06239066878333688, "learning_rate": 7.688e-06, "loss": 0.0025, "num_tokens": 34552759.0, "reward": 2.781367301940918, "reward_std": 0.028873471543192863, "rewards/reward_fn/mean": 2.781367301940918, "rewards/reward_fn/std": 0.028873484581708908, "step": 781 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 148.0, "completions/max_terminated_length": 148.0, "completions/mean_length": 118.96875, "completions/mean_terminated_length": 118.96875, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "epoch": 0.08295321947597327, "frac_reward_zero_std": 0.0, "grad_norm": 1.921875, "kl": 0.013791139441309497, "learning_rate": 7.687599999999999e-06, "loss": 0.0005, "num_tokens": 34589942.0, "reward": 3.1045916080474854, "reward_std": 0.0047982302494347095, "rewards/reward_fn/mean": 3.1045916080474854, "rewards/reward_fn/std": 0.004798218607902527, "step": 782 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 612.0, "completions/max_terminated_length": 612.0, "completions/mean_length": 412.84375, "completions/mean_terminated_length": 412.84375, "completions/min_length": 305.0, "completions/min_terminated_length": 305.0, "epoch": 0.08305929776174817, "frac_reward_zero_std": 0.0, "grad_norm": 0.84375, "kl": 0.03246723604388535, "learning_rate": 7.6872e-06, "loss": 0.0013, "num_tokens": 34660113.0, "reward": 3.670332908630371, "reward_std": 0.7970924377441406, "rewards/reward_fn/mean": 3.670332908630371, "rewards/reward_fn/std": 0.7970924377441406, "step": 783 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 361.0, "completions/max_terminated_length": 361.0, "completions/mean_length": 240.1875, "completions/mean_terminated_length": 240.1875, "completions/min_length": 174.0, "completions/min_terminated_length": 174.0, "epoch": 0.08316537604752307, "frac_reward_zero_std": 0.0, "grad_norm": 1.6171875, "kl": 0.05126189975999296, "learning_rate": 7.686799999999999e-06, "loss": 0.0021, "num_tokens": 34686967.0, "reward": 3.855468988418579, "reward_std": 0.3885265290737152, "rewards/reward_fn/mean": 3.855468988418579, "rewards/reward_fn/std": 0.3885265290737152, "step": 784 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1312.0, "completions/max_terminated_length": 1312.0, "completions/mean_length": 725.59375, "completions/mean_terminated_length": 725.59375, "completions/min_length": 275.0, "completions/min_terminated_length": 275.0, "epoch": 0.08327145433329797, "frac_reward_zero_std": 0.0, "grad_norm": 1.609375, "kl": 0.08870382863096893, "learning_rate": 7.6864e-06, "loss": 0.0035, "num_tokens": 34748234.0, "reward": 2.293332815170288, "reward_std": 0.5246787071228027, "rewards/reward_fn/mean": 2.293332815170288, "rewards/reward_fn/std": 0.5246787667274475, "step": 785 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 228.0, "completions/max_terminated_length": 228.0, "completions/mean_length": 200.59375, "completions/mean_terminated_length": 200.59375, "completions/min_length": 186.0, "completions/min_terminated_length": 186.0, "epoch": 0.08337753261907288, "frac_reward_zero_std": 0.0, "grad_norm": 1.546875, "kl": 0.04654703033156693, "learning_rate": 7.685999999999999e-06, "loss": 0.0019, "num_tokens": 34801277.0, "reward": 3.829470157623291, "reward_std": 0.36089715361595154, "rewards/reward_fn/mean": 3.829470157623291, "rewards/reward_fn/std": 0.3608972132205963, "step": 786 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 353.0, "completions/max_terminated_length": 353.0, "completions/mean_length": 158.9375, "completions/mean_terminated_length": 158.9375, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "epoch": 0.08348361090484778, "frac_reward_zero_std": 0.0, "grad_norm": 2.40625, "kl": 0.07684183481615037, "learning_rate": 7.6856e-06, "loss": 0.0031, "num_tokens": 34840795.0, "reward": 2.852140188217163, "reward_std": 0.03964653238654137, "rewards/reward_fn/mean": 2.852140188217163, "rewards/reward_fn/std": 0.03964650258421898, "step": 787 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 221.0, "completions/max_terminated_length": 221.0, "completions/mean_length": 187.96875, "completions/mean_terminated_length": 187.96875, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "epoch": 0.08358968919062268, "frac_reward_zero_std": 1.0, "grad_norm": 0.080078125, "kl": 0.048744586820248514, "learning_rate": 7.685199999999999e-06, "loss": 0.0019, "num_tokens": 34878458.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 788 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 171.0, "completions/max_terminated_length": 171.0, "completions/mean_length": 150.53125, "completions/mean_terminated_length": 150.53125, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "epoch": 0.08369576747639758, "frac_reward_zero_std": 1.0, "grad_norm": 0.11083984375, "kl": 0.04753570049069822, "learning_rate": 7.6848e-06, "loss": 0.0019, "num_tokens": 34916299.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 789 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 561.0, "completions/max_terminated_length": 561.0, "completions/mean_length": 283.9375, "completions/mean_terminated_length": 283.9375, "completions/min_length": 204.0, "completions/min_terminated_length": 204.0, "epoch": 0.08380184576217248, "frac_reward_zero_std": 0.0, "grad_norm": 1.40625, "kl": 0.09668770106509328, "learning_rate": 7.6844e-06, "loss": 0.0039, "num_tokens": 34974889.0, "reward": 2.762829303741455, "reward_std": 0.036893703043460846, "rewards/reward_fn/mean": 2.762829303741455, "rewards/reward_fn/std": 0.03689371794462204, "step": 790 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 568.0, "completions/max_terminated_length": 568.0, "completions/mean_length": 400.75, "completions/mean_terminated_length": 400.75, "completions/min_length": 243.0, "completions/min_terminated_length": 243.0, "epoch": 0.08390792404794739, "frac_reward_zero_std": 0.0, "grad_norm": 1.0, "kl": 0.03603732128976844, "learning_rate": 7.684e-06, "loss": 0.0014, "num_tokens": 35032673.0, "reward": 3.586613178253174, "reward_std": 0.6132650375366211, "rewards/reward_fn/mean": 3.586613178253174, "rewards/reward_fn/std": 0.6132650375366211, "step": 791 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 148.0, "completions/max_terminated_length": 148.0, "completions/mean_length": 100.9375, "completions/mean_terminated_length": 100.9375, "completions/min_length": 68.0, "completions/min_terminated_length": 68.0, "epoch": 0.08401400233372229, "frac_reward_zero_std": 1.0, "grad_norm": 0.1494140625, "kl": 0.05937142693437636, "learning_rate": 7.6836e-06, "loss": 0.0024, "num_tokens": 35063263.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 792 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 277.0, "completions/max_terminated_length": 277.0, "completions/mean_length": 151.46875, "completions/mean_terminated_length": 151.46875, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "epoch": 0.08412008061949719, "frac_reward_zero_std": 1.0, "grad_norm": 0.13671875, "kl": 0.06238317512907088, "learning_rate": 7.6832e-06, "loss": 0.0025, "num_tokens": 35102414.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 793 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 232.0, "completions/max_terminated_length": 232.0, "completions/mean_length": 181.90625, "completions/mean_terminated_length": 181.90625, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "epoch": 0.08422615890527209, "frac_reward_zero_std": 1.0, "grad_norm": 0.11376953125, "kl": 0.0344988465658389, "learning_rate": 7.6828e-06, "loss": 0.0014, "num_tokens": 35141195.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 794 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 330.0, "completions/max_terminated_length": 330.0, "completions/mean_length": 249.6875, "completions/mean_terminated_length": 249.6875, "completions/min_length": 186.0, "completions/min_terminated_length": 186.0, "epoch": 0.08433223719104699, "frac_reward_zero_std": 1.0, "grad_norm": 0.1416015625, "kl": 0.05292107805144042, "learning_rate": 7.6824e-06, "loss": 0.0021, "num_tokens": 35184545.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 795 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 848.0, "completions/max_terminated_length": 848.0, "completions/mean_length": 381.375, "completions/mean_terminated_length": 381.375, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "epoch": 0.0844383154768219, "frac_reward_zero_std": 0.0, "grad_norm": 2.0625, "kl": 0.082465800922364, "learning_rate": 7.682e-06, "loss": 0.0033, "num_tokens": 35226829.0, "reward": 2.704169750213623, "reward_std": 0.029407240450382233, "rewards/reward_fn/mean": 2.704169750213623, "rewards/reward_fn/std": 0.02940722554922104, "step": 796 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 826.0, "completions/max_terminated_length": 826.0, "completions/mean_length": 429.0, "completions/mean_terminated_length": 429.0, "completions/min_length": 249.0, "completions/min_terminated_length": 249.0, "epoch": 0.0845443937625968, "frac_reward_zero_std": 0.0, "grad_norm": 1.7421875, "kl": 0.08249839709606022, "learning_rate": 7.6816e-06, "loss": 0.0033, "num_tokens": 35277645.0, "reward": 2.570937156677246, "reward_std": 0.46163350343704224, "rewards/reward_fn/mean": 2.570937156677246, "rewards/reward_fn/std": 0.46163344383239746, "step": 797 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 196.0, "completions/max_terminated_length": 196.0, "completions/mean_length": 138.25, "completions/mean_terminated_length": 138.25, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "epoch": 0.0846504720483717, "frac_reward_zero_std": 1.0, "grad_norm": 0.1474609375, "kl": 0.052692751720314845, "learning_rate": 7.681199999999999e-06, "loss": 0.0021, "num_tokens": 35307957.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 798 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 495.0, "completions/max_terminated_length": 495.0, "completions/mean_length": 246.03125, "completions/mean_terminated_length": 246.03125, "completions/min_length": 207.0, "completions/min_terminated_length": 207.0, "epoch": 0.0847565503341466, "frac_reward_zero_std": 0.0, "grad_norm": 1.359375, "kl": 0.049287278146948665, "learning_rate": 7.6808e-06, "loss": 0.002, "num_tokens": 35354038.0, "reward": 2.8469812870025635, "reward_std": 0.30314865708351135, "rewards/reward_fn/mean": 2.8469812870025635, "rewards/reward_fn/std": 0.30314865708351135, "step": 799 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 367.0, "completions/max_terminated_length": 367.0, "completions/mean_length": 194.96875, "completions/mean_terminated_length": 194.96875, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "epoch": 0.0848626286199215, "frac_reward_zero_std": 1.0, "grad_norm": 0.115234375, "kl": 0.06779409275623038, "learning_rate": 7.680399999999998e-06, "loss": 0.0027, "num_tokens": 35391893.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 800 } ], "logging_steps": 1, "max_steps": 20000, "num_input_tokens_seen": 35391893, "num_train_epochs": 3, "save_steps": 200, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }